diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml
index 16602627c43..7298b8b6ab7 100644
--- a/.ci/pnnx.yml
+++ b/.ci/pnnx.yml
@@ -52,6 +52,10 @@ jobs:
- torch-version: 2.1.0
torchvision-version: 0.16.0
torchvision-cache-key: '0_16_0'
+ - torch-version: 2.2.1
+ torchvision-version: 0.17.1
+ torchvision-cache-key: '0_17_1'
+
runs-on:
pool-name: docker
container:
@@ -122,6 +126,7 @@ jobs:
- name: test
run: |
export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}}
+ export LD_LIBRARY_PATH=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install/lib
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export MKL_ENABLE_INSTRUCTIONS=SSE4_2
@@ -131,8 +136,9 @@ jobs:
- name: python-pnnx
run: |
export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}}
+ export LD_LIBRARY_PATH=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install/lib
export PNNX_WHEEL_WITHOUT_BUILD=ON
- cd tools/pnnx
- cp build/src/pnnx python/pnnx/
+ cd tools/pnnx/python
+ cp ../build/src/pnnx pnnx/
python3 setup.py install --user
- pytest python/tests/
+ pytest tests
diff --git a/.github/workflows/android-armv7-cpu.yml b/.github/workflows/android-armv7-cpu.yml
index 42f85f60f3a..3cddc846389 100644
--- a/.github/workflows/android-armv7-cpu.yml
+++ b/.github/workflows/android-armv7-cpu.yml
@@ -33,12 +33,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: ndk-r16b
run: |
@@ -48,9 +48,9 @@ jobs:
run: |
mkdir build-noneon && cd build-noneon
cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noneon-shared
run: |
mkdir build-noneon-shared && cd build-noneon-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml
index 9507d4c2798..d416f7eaf6d 100644
--- a/.github/workflows/android-armv7-gpu.yml
+++ b/.github/workflows/android-armv7-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv8-cpu.yml b/.github/workflows/android-armv8-cpu.yml
index fa920f7aa19..98deabac44b 100644
--- a/.github/workflows/android-armv8-cpu.yml
+++ b/.github/workflows/android-armv8-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml
index a1cb55104c8..43ff4cee2de 100644
--- a/.github/workflows/android-armv8-gpu.yml
+++ b/.github/workflows/android-armv8-gpu.yml
@@ -37,27 +37,27 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-termux
run: |
mkdir build-termux && cd build-termux
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_PLATFORM_API=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-android-29
run: |
mkdir build-android-29 && cd build-android-29
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-android-29-shared
run: |
mkdir build-android-29-shared && cd build-android-29-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)

android-aarch64-gpu-ndk-r16b:
runs-on: ubuntu-20.04
@@ -72,4 +72,4 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
diff --git a/.github/workflows/android-x64-cpu.yml b/.github/workflows/android-x64-cpu.yml
index 55bf4018255..0c98606f6f3 100644
--- a/.github/workflows/android-x64-cpu.yml
+++ b/.github/workflows/android-x64-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml
index 4c0b6237999..d1260fe7e94 100644
--- a/.github/workflows/android-x64-gpu.yml
+++ b/.github/workflows/android-x64-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x86-cpu.yml b/.github/workflows/android-x86-cpu.yml
index cc49975a422..4dbf9b68cac 100644
--- a/.github/workflows/android-x86-cpu.yml
+++ b/.github/workflows/android-x86-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml
index 8b24690a765..6186968d316 100644
--- a/.github/workflows/android-x86-gpu.yml
+++ b/.github/workflows/android-x86-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml
index 8051371d3e5..c996e51c08e 100644
--- a/.github/workflows/code-format.yml
+++ b/.github/workflows/code-format.yml
@@ -19,7 +19,7 @@ jobs:
- name: cache-clang-format
id: cache-clang-format
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: clang-format-install
key: clang-format-install-4
diff --git a/.github/workflows/ios-arm64-gpu.yml b/.github/workflows/ios-arm64-gpu.yml
index 907f466c386..25f8cf4ad35 100644
--- a/.github/workflows/ios-arm64-gpu.yml
+++ b/.github/workflows/ios-arm64-gpu.yml
@@ -44,7 +44,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-install-20230504
diff --git a/.github/workflows/ios-cpu.yml b/.github/workflows/ios-cpu.yml
index 488b5fe88f7..501eac3377d 100644
--- a/.github/workflows/ios-cpu.yml
+++ b/.github/workflows/ios-cpu.yml
@@ -40,7 +40,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-install-20230504
diff --git a/.github/workflows/ios-simulator-gpu.yml b/.github/workflows/ios-simulator-gpu.yml
index 9a26c0ef0b4..4babdb4e532 100644
--- a/.github/workflows/ios-simulator-gpu.yml
+++ b/.github/workflows/ios-simulator-gpu.yml
@@ -44,7 +44,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-install-20230504
diff --git a/.github/workflows/ios-simulator.yml b/.github/workflows/ios-simulator.yml
index 7bb2a861f9b..1d550638313 100644
--- a/.github/workflows/ios-simulator.yml
+++ b/.github/workflows/ios-simulator.yml
@@ -42,7 +42,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-install-20230504
diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml
index a791da6c26a..0c1032bf9c0 100644
--- a/.github/workflows/linux-aarch64-cpu-gcc.yml
+++ b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -36,7 +36,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20220502-ubuntu-2004-2
@@ -57,7 +57,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -69,34 +69,34 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simplemath
run: |
mkdir build-simplestl-simplemath && cd build-simplestl-simplemath
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu-c.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simplemath
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-simplestl-simplemath
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-arm82:
runs-on: ubuntu-20.04
@@ -105,7 +105,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20220502-ubuntu-2004-2
@@ -126,7 +126,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -138,23 +138,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-arm86:
runs-on: ubuntu-22.04
@@ -163,7 +163,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20230717
@@ -184,7 +184,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -196,9 +196,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-arm-cpu-gcc.yml b/.github/workflows/linux-arm-cpu-gcc.yml
index 8222586f129..19d9c1cb370 100644
--- a/.github/workflows/linux-arm-cpu-gcc.yml
+++ b/.github/workflows/linux-arm-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -71,23 +71,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)

linux-gcc-armhf:
runs-on: ubuntu-20.04
@@ -96,7 +96,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -117,7 +117,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -129,23 +129,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)

linux-gcc-armhf-vfpv3-d16:
runs-on: ubuntu-20.04
@@ -154,7 +154,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -175,7 +175,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -187,20 +187,20 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-mips-cpu-gcc.yml b/.github/workflows/linux-mips-cpu-gcc.yml
index 7265d2ce0ee..f6e1e74792c 100644
--- a/.github/workflows/linux-mips-cpu-gcc.yml
+++ b/.github/workflows/linux-mips-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mipsel-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsel-gnu-toolchain
@@ -70,13 +70,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-mipsisa32r6el:
runs-on: ubuntu-20.04
@@ -85,7 +85,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mipsel-install-20220502-2
@@ -106,7 +106,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsisa32r6el-gnu-toolchain
@@ -117,10 +117,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-mips64-cpu-gcc.yml b/.github/workflows/linux-mips64-cpu-gcc.yml
index 5ca70798838..890f1054d5b 100644
--- a/.github/workflows/linux-mips64-cpu-gcc.yml
+++ b/.github/workflows/linux-mips64-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mips64el-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mips64el-gnuabi64-toolchain
@@ -70,13 +70,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)

linux-gcc-mipsisa64r6el:
runs-on: ubuntu-20.04
@@ -85,7 +85,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mips64el-install-20220502-4
@@ -118,7 +118,7 @@ jobs:
patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch
patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsisa64r6el-gnuabi64-toolchain
@@ -129,10 +129,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-ppc64-cpu-gcc.yml b/.github/workflows/linux-ppc64-cpu-gcc.yml
index 88fdccee092..834bfb56952 100644
--- a/.github/workflows/linux-ppc64-cpu-gcc.yml
+++ b/.github/workflows/linux-ppc64-cpu-gcc.yml
@@ -34,7 +34,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc-install-20220502-2
@@ -55,7 +55,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc-gnu-toolchain
@@ -66,13 +66,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-ppc64le:
runs-on: ubuntu-20.04
@@ -81,7 +81,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -102,7 +102,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -113,13 +113,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-power8le-vsx:
runs-on: ubuntu-20.04
@@ -128,7 +128,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -149,7 +149,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -160,13 +160,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power8le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j $(nproc)

linux-gcc-power9le-vsx:
runs-on: ubuntu-20.04
steps:
@@ -174,7 +174,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -195,7 +195,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -206,10 +206,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml
index c0e8ca88b0d..cfd9685b800 100644
--- a/.github/workflows/linux-riscv64-cpu-gcc.yml
+++ b/.github/workflows/linux-riscv64-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-riscv64-install-20220502-4
@@ -61,7 +61,7 @@ jobs:
wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: riscv64-gnu-toolchain
@@ -72,13 +72,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-riscv64-c906:
runs-on: [self-hosted, linux, centos]
@@ -106,7 +106,7 @@ jobs:
#- name: cache-qemu
#id: cache-qemu
- #uses: actions/cache@v3
+ #uses: actions/cache@v4
#with:
#path: qemu-install
#key: qemu-riscv64-install-20220502-3
@@ -134,7 +134,7 @@ jobs:
#- name: cache-riscv
#id: cache-riscv
- #uses: actions/cache@v3
+ #uses: actions/cache@v4
#with:
#path: rv64gcv-install-next
#key: rv64gcv-linux-install-20210504
diff --git a/.github/workflows/linux-x64-cpu-clang-python.yml b/.github/workflows/linux-x64-cpu-clang-python.yml
index 9684fcd2e3a..8e6f6718f2f 100644
--- a/.github/workflows/linux-x64-cpu-clang-python.yml
+++ b/.github/workflows/linux-x64-cpu-clang-python.yml
@@ -51,7 +51,7 @@ jobs:
CXX: clang++
run: mkdir build && cd build && cmake -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml
index b03c2e5a8e4..185a3642caa 100644
--- a/.github/workflows/linux-x64-cpu-clang.yml
+++ b/.github/workflows/linux-x64-cpu-clang.yml
@@ -50,9 +50,9 @@ jobs:
run: |
mkdir build-sse2 && cd build-sse2
cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-sse2
- run: cd build-sse2 && ctest --output-on-failure -j 2
+ run: cd build-sse2 && ctest --output-on-failure -j $(nproc)
- name: build-shared
env:
CC: clang
@@ -60,7 +60,7 @@ jobs:
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-avx2
env:
CC: clang
@@ -68,9 +68,9 @@ jobs:
run: |
mkdir build-avx2 && cd build-avx2
cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx2
- run: cd build-avx2 && ctest --output-on-failure -j 2
+ run: cd build-avx2 && ctest --output-on-failure -j $(nproc)
- name: build-avx
env:
CC: clang
@@ -78,9 +78,9 @@ jobs:
run: |
mkdir build-avx && cd build-avx
cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx
- run: cd build-avx && ctest --output-on-failure -j 2
+ run: cd build-avx && ctest --output-on-failure -j $(nproc)
- name: build-avx1-2
env:
CC: clang
@@ -88,9 +88,9 @@ jobs:
run: |
mkdir build-avx1-2 && cd build-avx1-2
cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx1-2
- run: cd build-avx1-2 && ctest --output-on-failure -j 2
+ run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc)
- name: build-noint8
env:
CC: clang
@@ -98,9 +98,9 @@ jobs:
run: |
mkdir build-noint8 && cd build-noint8
cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)

linux-clang-simplestl:
runs-on: ubuntu-latest
@@ -113,9 +113,9 @@ jobs:
run: |
mkdir build-simplestl && cd build-simplestl
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl
- run: cd build-simplestl && ctest --output-on-failure -j 2
+ run: cd build-simplestl && ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simpleomp
env:
CC: clang
@@ -123,6 +123,6 @@ jobs:
run: |
mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simpleomp
- run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2
+ run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-musl.yml b/.github/workflows/linux-x64-cpu-gcc-musl.yml
index d18c9cbc215..cf3d2087d20 100644
--- a/.github/workflows/linux-x64-cpu-gcc-musl.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-musl.yml
@@ -56,12 +56,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
shell: alpine.sh {0}
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-san.yml b/.github/workflows/linux-x64-cpu-gcc-san.yml
index ad266652929..8a52096461f 100644
--- a/.github/workflows/linux-x64-cpu-gcc-san.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-san.yml
@@ -35,8 +35,8 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
cd build
- ctest --output-on-failure -j 2
+ ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-sde.yml b/.github/workflows/linux-x64-cpu-gcc-sde.yml
index ca0f777a017..eb680173743 100644
--- a/.github/workflows/linux-x64-cpu-gcc-sde.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-sde.yml
@@ -42,7 +42,7 @@ jobs:
- name: gcc12
run: sudo apt-get install gcc-12 g++-12
- name: Setup SDE binaries
- uses: petarpetrovt/setup-sde@v2.3
+ uses: petarpetrovt/setup-sde@v2.4
- name: build-avx512-spr
env:
CC: gcc-12
@@ -50,8 +50,8 @@ jobs:
run: |
mkdir build-avx512-spr && cd build-avx512-spr
cmake -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx512-spr
run: |
cd build-avx512-spr
- TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml
index 6d1a41a15b9..ab2185be3e7 100644
--- a/.github/workflows/linux-x64-cpu-gcc.yml
+++ b/.github/workflows/linux-x64-cpu-gcc.yml
@@ -47,42 +47,42 @@ jobs:
run: |
mkdir build-sse2 && cd build-sse2
cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-sse2
- run: cd build-sse2 && ctest --output-on-failure -j 2
+ run: cd build-sse2 && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-avx2
run: |
mkdir build-avx2 && cd build-avx2
cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx2
- run: cd build-avx2 && ctest --output-on-failure -j 2
+ run: cd build-avx2 && ctest --output-on-failure -j $(nproc)
- name: build-avx
run: |
mkdir build-avx && cd build-avx
cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx
- run: cd build-avx && ctest --output-on-failure -j 2
+ run: cd build-avx && ctest --output-on-failure -j $(nproc)
- name: build-avx1-2
run: |
mkdir build-avx1-2 && cd build-avx1-2
cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx1-2
- run: cd build-avx1-2 && ctest --output-on-failure -j 2
+ run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)

linux-gcc-cpp03-nostdio-nostring-simplestl:
runs-on: ubuntu-20.04
@@ -92,28 +92,28 @@ jobs:
run: |
mkdir build-nostdio && cd build-nostdio
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-nostdio
- run: cd build-nostdio && ctest --output-on-failure -j 2
+ run: cd build-nostdio && ctest --output-on-failure -j $(nproc)
- name: build-nostdio-nostring
run: |
mkdir build-nostdio-nostring && cd build-nostdio-nostring
cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-simplestl
run: |
mkdir build-simplestl && cd build-simplestl
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl
- run: cd build-simplestl && ctest --output-on-failure -j 2
+ run: cd build-simplestl && ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simpleomp
run: |
mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simpleomp
- run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2
+ run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc)

linux-gcc-avx512:
runs-on: [self-hosted, linux, t4]
diff --git a/.github/workflows/linux-x64-gpu-clang-python.yml b/.github/workflows/linux-x64-gpu-clang-python.yml
index 12cd441ad20..ea9232bcfc4 100644
--- a/.github/workflows/linux-x64-gpu-clang-python.yml
+++ b/.github/workflows/linux-x64-gpu-clang-python.yml
@@ -40,7 +40,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
@@ -62,7 +62,7 @@ jobs:
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
- name: set up python ${{ matrix.python-version }}
@@ -80,7 +80,7 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml
index e5eecc37d7b..8ab7e6ae961 100644
--- a/.github/workflows/linux-x64-gpu-clang.yml
+++ b/.github/workflows/linux-x64-gpu-clang.yml
@@ -43,7 +43,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml
index 7d25327eaed..55eb9ff87f2 100644
--- a/.github/workflows/linux-x64-gpu-gcc.yml
+++ b/.github/workflows/linux-x64-gpu-gcc.yml
@@ -43,7 +43,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
@@ -98,12 +98,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake ..
- cmake --build . -j 4
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 4
+ cmake --build . -j $(nproc)

linux-gcc-gpu-t4:
runs-on: [self-hosted, linux, t4]
diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml
index 52ef4e969b0..2ce454c36f4 100644
--- a/.github/workflows/linux-x86-cpu-clang.yml
+++ b/.github/workflows/linux-x86-cpu-clang.yml
@@ -44,9 +44,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-shared
env:
CC: clang
@@ -54,7 +54,7 @@ jobs:
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noint8
env:
CC: clang
@@ -62,6 +62,6 @@ jobs:
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x86-cpu-gcc.yml b/.github/workflows/linux-x86-cpu-gcc.yml
index 3dda6701725..1d88eb3a840 100644
--- a/.github/workflows/linux-x86-cpu-gcc.yml
+++ b/.github/workflows/linux-x86-cpu-gcc.yml
@@ -41,25 +41,25 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-nosse
run: |
mkdir build-nosse && cd build-nosse
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-nosse
- run: cd build-nosse && ctest --output-on-failure -j 2
+ run: cd build-nosse && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/mac-catalyst-arm64-cpu.yml b/.github/workflows/mac-catalyst-arm64-cpu.yml
index 52f002897b6..4a4b5bae9ed 100644
--- a/.github/workflows/mac-catalyst-arm64-cpu.yml
+++ b/.github/workflows/mac-catalyst-arm64-cpu.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-arm64-gpu.yml b/.github/workflows/mac-catalyst-arm64-gpu.yml
index ff998648772..b1141287176 100644
--- a/.github/workflows/mac-catalyst-arm64-gpu.yml
+++ b/.github/workflows/mac-catalyst-arm64-gpu.yml
@@ -42,7 +42,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-x64-cpu.yml b/.github/workflows/mac-catalyst-x64-cpu.yml
index a21bb2ce8af..ce37229fcb3 100644
--- a/.github/workflows/mac-catalyst-x64-cpu.yml
+++ b/.github/workflows/mac-catalyst-x64-cpu.yml
@@ -46,7 +46,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-x64-gpu.yml b/.github/workflows/mac-catalyst-x64-gpu.yml
index 13ac747f212..4dabc202a78 100644
--- a/.github/workflows/mac-catalyst-x64-gpu.yml
+++ b/.github/workflows/mac-catalyst-x64-gpu.yml
@@ -50,7 +50,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/macos-arm64-cpu.yml b/.github/workflows/macos-arm64-cpu.yml
index 69d7518566e..09351c2f08d 100644
--- a/.github/workflows/macos-arm64-cpu.yml
+++ b/.github/workflows/macos-arm64-cpu.yml
@@ -39,7 +39,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-arm64-gpu.yml b/.github/workflows/macos-arm64-gpu.yml
index 3decda70cf8..1dbbe31ec32 100644
--- a/.github/workflows/macos-arm64-gpu.yml
+++ b/.github/workflows/macos-arm64-gpu.yml
@@ -43,7 +43,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-cpu-python.yml b/.github/workflows/macos-x64-cpu-python.yml
index 9aa7fb2aa58..6d048826064 100644
--- a/.github/workflows/macos-x64-cpu-python.yml
+++ b/.github/workflows/macos-x64-cpu-python.yml
@@ -54,7 +54,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-cpu.yml b/.github/workflows/macos-x64-cpu.yml
index f7e21b5ad1d..6db56205b70 100644
--- a/.github/workflows/macos-x64-cpu.yml
+++ b/.github/workflows/macos-x64-cpu.yml
@@ -50,7 +50,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml
index 83ad2dc79fd..bfb2ae5805b 100644
--- a/.github/workflows/macos-x64-gpu.yml
+++ b/.github/workflows/macos-x64-gpu.yml
@@ -53,7 +53,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
@@ -107,7 +107,7 @@ jobs:
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-macos-install-20230420
diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml
index 208c3288fe0..38f28e7b1d1 100644
--- a/.github/workflows/release-python.yml
+++ b/.github/workflows/release-python.yml
@@ -37,28 +37,29 @@ jobs:
- uses: actions/upload-artifact@v4
with:
+ name: sdist
path: dist/*.tar.gz

build_wheels:
- name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }}
+ name: ${{ matrix.arch }} ${{ matrix.build_id }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
include:
- - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' }
- - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' }
- - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' }
- - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' }
- - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' }
- - { os: ubuntu-20.04, arch: i686, build: 'pp*' }
- - { os: windows-2019, arch: x86, build: 'cp*' }
- - { os: windows-2019, arch: AMD64, build: 'cp*' }
- - { os: windows-2019, arch: AMD64, build: 'pp*' }
- - { os: windows-2019, arch: ARM64, build: 'cp*' }
- - { os: macos-latest, arch: x86_64, build: 'cp*' }
- - { os: macos-latest, arch: x86_64, build: 'pp*' }
- - { os: macos-latest, arch: arm64, build: 'cp*' }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*', build_id: cp-manylinux }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*', build_id: cp-musllinux }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'pp*', build_id: pp }
+ - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*', build_id: cp-manylinux }
+ - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*', build_id: cp-musllinux }
+ - { os: ubuntu-20.04, arch: i686, build: 'pp*', build_id: pp }
+ - { os: windows-2019, arch: x86, build: 'cp*', build_id: cp }
+ - { os: windows-2019, arch: AMD64, build: 'cp*', build_id: cp }
+ - { os: windows-2019, arch: AMD64, build: 'pp*', build_id: pp }
+ - { os: windows-2019, arch: ARM64, build: 'cp*', build_id: cp }
+ - { os: macos-latest, arch: x86_64, build: 'cp*', build_id: cp }
+ - { os: macos-latest, arch: x86_64, build: 'pp*', build_id: pp }
+ - { os: macos-latest, arch: arm64, build: 'cp*', build_id: cp }

steps:
- uses: actions/checkout@v4
@@ -72,24 +73,24 @@ jobs:
# build wheels for ubuntu-20.04
- name: Build wheels for ubuntu
if: matrix.os == 'ubuntu-20.04'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
with:
output-dir: wheelhouse
# build wheels for windows-2019
- name: Build wheels for windows
if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86')
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
CIBW_BEFORE_BUILD: pip install delvewheel
CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel}
with:
@@ -97,12 +98,12 @@ jobs:
- name: Build wheels for windows ARM64
if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
CIBW_BEFORE_BUILD: pip install delvewheel
CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel} --no-dll "msvcp140.dll;vcomp140.dll"
with:
@@ -112,7 +113,7 @@ jobs:
- name: cache-openmp for macos
if: matrix.os == 'macos-latest'
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
@@ -178,12 +179,12 @@ jobs:
- name: Build wheels for macos x86_64
if: matrix.os == 'macos-latest' && matrix.arch == 'x86_64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64"
DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
@@ -195,12 +196,12 @@ jobs:
- name: Build wheels for macos arm64
if: matrix.os == 'macos-latest' && matrix.arch == 'arm64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64"
DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
@@ -221,30 +222,19 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
+ name: wheels-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.build_id }}
path: wheelhouse/*.whl
- build_wheels_qemu:
- name: ${{ matrix.arch }} ${{ matrix.build }}
+ build_wheels_qemu_cp:
+ name: ${{ matrix.arch }} ${{ matrix.build_cp }} ${{ matrix.build_sub }}
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
arch: [aarch64, ppc64le, s390x]
- build: [ 'cp36-manylinux*', 'cp37-manylinux*', 'cp38-manylinux*',
- 'cp39-manylinux*', 'cp310-manylinux*', 'cp311-manylinux*',
- 'cp312-manylinux*', 'cp36-musllinux*', 'cp37-musllinux*',
- 'cp38-musllinux*', 'cp39-musllinux*', 'cp310-musllinux*',
- 'cp311-musllinux*', 'cp312-musllinux*' ]
- include:
- - arch: aarch64
- build: 'pp37-*'
- - arch: aarch64
- build: 'pp38-*'
- - arch: aarch64
- build: 'pp39-*'
- - arch: aarch64
- build: 'pp310-*'
+ build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312]
+ build_sub: [manylinux, musllinux]
steps:
- uses: actions/checkout@v4
@@ -261,12 +251,60 @@ jobs:
platforms: all
- name: Build wheels for manylinux with qemu
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
- CIBW_BUILD: ${{ matrix.build }}
+ CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}*
+ CIBW_BUILD_VERBOSITY: 1
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
+ with:
+ output-dir: wheelhouse
+
+ - name: Show files
+ run: ls -lh wheelhouse
+ shell: bash
+
+ - name: Verify clean directory
+ run: git diff --exit-code
+ shell: bash
+
+ - name: Upload wheels
+ uses: actions/upload-artifact@v4
+ with:
+ name: wheels_qemu_cp-${{ matrix.arch }}-${{ matrix.build_cp }}-${{ matrix.build_sub }}
+ path: wheelhouse/*.whl
+
+ build_wheels_qemu_pp:
+ name: ${{ matrix.arch }} ${{ matrix.build_pp }}
+ runs-on: ubuntu-20.04
+
+ strategy:
+ fail-fast: false
+ matrix:
+ arch: [aarch64]
+ build_pp: [pp37, pp38, pp39, pp310]
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: '3.x'
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+ with:
+ platforms: all
+
+ - name: Build wheels for manylinux with qemu
+ uses: pypa/cibuildwheel@v2.17.0
+ env:
+ CIBW_ARCHS_LINUX: ${{ matrix.arch }}
+ CIBW_BUILD: ${{ matrix.build_pp }}-*
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
with:
output-dir: wheelhouse
@@ -281,13 +319,14 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
+ name: wheels_qemu_pp-${{ matrix.arch }}-${{ matrix.build_pp }}
path: wheelhouse/*.whl
upload_all:
permissions:
contents: none
name: Upload
- needs: [build_wheels, build_wheels_qemu, build_sdist]
+ needs: [build_wheels, build_wheels_qemu_cp, build_wheels_qemu_pp, build_sdist]
runs-on: ubuntu-latest
steps:
@@ -297,8 +336,8 @@ jobs:
- uses: actions/download-artifact@v4
with:
- name: artifact
path: dist
+ merge-multiple: true
- uses: pypa/gh-action-pypi-publish@release/v1
with:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f99d47c711c..5c355f41145 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -102,7 +102,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -134,7 +134,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -166,7 +166,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -186,7 +186,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-release-11.0.0-20230504
@@ -407,7 +407,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-release-11.0.0-20230504
@@ -677,7 +677,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-bitcode-release-11.0.0-20230504
@@ -947,7 +947,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-release-11.0.0-20230504
@@ -1217,7 +1217,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-bitcode-release-11.0.0-20230504
@@ -1487,7 +1487,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-release-11.0.0-20230504
@@ -1731,7 +1731,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-bitcode-release-11.0.0-20230504
@@ -2185,7 +2185,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2193,7 +2193,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
@@ -2201,7 +2201,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2209,7 +2209,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2242,7 +2242,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2250,7 +2250,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
@@ -2258,7 +2258,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2266,7 +2266,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2299,9 +2299,9 @@ jobs:
run: |
mkdir build-armv7 && cd build-armv7
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2309,15 +2309,15 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
mkdir build-x86 && cd build-x86
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2325,7 +2325,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2358,9 +2358,9 @@ jobs:
run: |
mkdir build-armv7 && cd build-armv7
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2368,15 +2368,15 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
mkdir build-x86 && cd build-x86
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2384,7 +2384,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2422,7 +2422,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-simd
run: |
@@ -2431,7 +2431,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-threads
run: |
@@ -2440,7 +2440,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-simd-threads
run: |
@@ -2449,7 +2449,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2479,7 +2479,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2015-x86-x64-install
@@ -2491,24 +2491,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2536,7 +2536,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2015-x86-x64-install
@@ -2548,24 +2548,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2593,7 +2593,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2017-x86-x64-install
@@ -2605,24 +2605,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2650,7 +2650,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2017-x86-x64-install
@@ -2662,24 +2662,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2707,7 +2707,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2019-x86-x64-install
@@ -2719,36 +2719,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2780,7 +2780,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2019-x86-x64-install
@@ -2792,36 +2792,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2853,7 +2853,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2022-x86-x64-install
@@ -2865,36 +2865,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2926,7 +2926,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2022-x86-x64-install
@@ -2938,36 +2938,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -3197,7 +3197,7 @@ jobs:
path: artifacts
- name: create-release
- uses: softprops/action-gh-release@v1
+ uses: softprops/action-gh-release@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
tag_name: ${{ needs.setup.outputs.VERSION }}
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
index 7aa9c58d27e..83e6328bb22 100644
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -119,7 +119,7 @@ jobs:
- name: lcov
run: sudo apt-get install lcov
- name: Setup SDE binaries
- uses: petarpetrovt/setup-sde@v2.3
+ uses: petarpetrovt/setup-sde@v2.4
- name: build-avx512-spr
env:
CC: gcc-12
diff --git a/.github/workflows/web-assembly.yml b/.github/workflows/web-assembly.yml
index f997f9dc2a3..1b5e8915a86 100644
--- a/.github/workflows/web-assembly.yml
+++ b/.github/workflows/web-assembly.yml
@@ -47,30 +47,30 @@ jobs:
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-basic && cd build-basic
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-basic
run: |
cd build-basic
- TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc)
- name: build-simd
run: |
source emsdk/emsdk_env.sh
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-simd && cd build-simd
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simd
run: |
cd build-simd
- TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j $(nproc)
- name: build-simd-omp
run: |
source emsdk/emsdk_env.sh
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-simd-omp && cd build-simd-omp
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simd-omp
run: |
cd build-simd-omp
- TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml
index 301715b833f..6cf29356c07 100644
--- a/.github/workflows/windows-arm-cpu.yml
+++ b/.github/workflows/windows-arm-cpu.yml
@@ -49,9 +49,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-arm-gpu.yml b/.github/workflows/windows-arm-gpu.yml
index 70db051ac56..787ffdbdc76 100644
--- a/.github/workflows/windows-arm-gpu.yml
+++ b/.github/workflows/windows-arm-gpu.yml
@@ -51,9 +51,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml
index 1fded4ac622..7032385ead0 100644
--- a/.github/workflows/windows-arm64-cpu.yml
+++ b/.github/workflows/windows-arm64-cpu.yml
@@ -49,12 +49,12 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
woa-linux:
name: woa-linux
@@ -82,8 +82,8 @@ jobs:
export PATH=/msvc/bin/arm64:$PATH
mkdir build && cd build
cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_SYSTEM_NAME=Windows -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j $(nproc)
- name: test
run: |
cd build
- TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/windows-arm64-gpu.yml b/.github/workflows/windows-arm64-gpu.yml
index cb5f9fad430..fa1b8994b2c 100644
--- a/.github/workflows/windows-arm64-gpu.yml
+++ b/.github/workflows/windows-arm64-gpu.yml
@@ -51,9 +51,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml
index e1f956a4688..3df91ab878b 100644
--- a/.github/workflows/windows-x64-cpu-vs2019-python.yml
+++ b/.github/workflows/windows-x64-cpu-vs2019-python.yml
@@ -50,7 +50,7 @@ jobs:
run: |
mkdir build; cd build
cmake -T v142,host=x64 -A x64 -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml
index 604d31406ca..67def785c7c 100644
--- a/.github/workflows/windows-x64-cpu.yml
+++ b/.github/workflows/windows-x64-cpu.yml
@@ -61,7 +61,7 @@ jobs:
- uses: actions/checkout@v4
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-${{ matrix.vs-version }}-x64-install-2
@@ -72,31 +72,31 @@ jobs:
7z x ./protobuf-3.11.2.zip
cd protobuf-3.11.2
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-sse2
run: |
mkdir build-sse2; cd build-sse2
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-sse2
- run: cd build-sse2; ctest -C Release --output-on-failure -j 2
+ run: cd build-sse2; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=ON -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-avx2
run: |
mkdir build-avx2; cd build-avx2
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-avx2
- run: cd build-avx2; ctest -C Release --output-on-failure -j 2
+ run: cd build-avx2; ctest -C Release --output-on-failure -j 4
- name: build-avx
run: |
mkdir build-avx; cd build-avx
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-avx
- run: cd build-avx; ctest -C Release --output-on-failure -j 2
+ run: cd build-avx; ctest -C Release --output-on-failure -j 4
diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml
index 1272b7ed920..57be84a602f 100644
--- a/.github/workflows/windows-x64-gpu.yml
+++ b/.github/workflows/windows-x64-gpu.yml
@@ -65,7 +65,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-${{ matrix.vs-version }}-x64-install-2
@@ -76,12 +76,12 @@ jobs:
7z x ./protobuf-3.11.2.zip
cd protobuf-3.11.2
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: cache-swiftshader
if: matrix.vs-version != 'vs2015'
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-${{ matrix.vs-version }}-x64-install-20230420
@@ -103,22 +103,22 @@ jobs:
cd swiftshader
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"
- name: build
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test
if: matrix.vs-version != 'vs2015'
run: |
echo "[Processor]`nThreadCount=1`n" > build/tests/Release/SwiftShader.ini
Copy-Item -Path "$env:GITHUB_WORKSPACE\swiftshader-install\vulkan-1.dll" -Destination 'build\tests'
- cd build; ctest -C Release --output-on-failure -j 2
+ cd build; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml
index 26d9aaf8b72..68f09157627 100644
--- a/.github/workflows/windows-x86-cpu.yml
+++ b/.github/workflows/windows-x86-cpu.yml
@@ -57,11 +57,11 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test
- run: cd build; ctest -C Release --output-on-failure -j 2
+ run: cd build; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x86-gpu.yml b/.github/workflows/windows-x86-gpu.yml
index 4161025f481..4f84665b479 100644
--- a/.github/workflows/windows-x86-gpu.yml
+++ b/.github/workflows/windows-x86-gpu.yml
@@ -59,9 +59,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.gitignore b/.gitignore
index 2c71aee0332..cd69c526f19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,10 +54,9 @@ __pycache__
*.pyd
*.egg-info/
python/setup.py
-tools/pnnx/python/setup.py
# Clangd
.cache/
# Xmake
-.xmake/
\ No newline at end of file
+.xmake/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ffd677bb33..785e2cd3926 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,11 +97,6 @@ else()
endif()
if(NCNN_SHARED_LIB)
- if(NCNN_BUILD_TESTS)
- message(WARNING "NCNN_SHARED_LIB must be OFF to build tests! NCNN_BUILD_TESTS will be turned off.")
- set(NCNN_BUILD_TESTS OFF)
- endif()
-
if(NCNN_ENABLE_LTO)
# enable global link time optimization
cmake_policy(SET CMP0069 NEW)
diff --git a/README.md b/README.md
index 3f1904d8f15..a4b2876a5e2 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![NCNN](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)
+![ncnn](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)
# ncnn
@@ -6,12 +6,12 @@
[![Download Total Count](https://img.shields.io/github/downloads/Tencent/ncnn/total.svg?style=for-the-badge)](https://github.com/Tencent/ncnn/releases)
[![codecov](https://img.shields.io/codecov/c/github/Tencent/ncnn/master?style=for-the-badge)](https://codecov.io/gh/Tencent/ncnn)
-ncnn is a high-performance neural network inference computing framework optimized for mobile platforms.
+ncnn is a high-performance neural network inference computing framework optimized for mobile platforms.
ncnn is deeply considerate about deployment and uses on mobile phones from the beginning of design.
-ncnn does not have third party dependencies. It is cross-platform, and runs faster than all known open source frameworks on mobile phone cpu.
-Developers can easily deploy deep learning algorithm models to the mobile platform by using efficient ncnn implementation,
-create intelligent APPs, and bring the artificial intelligence to your fingertips.
-ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu and so on.
+ncnn does not have third-party dependencies.
+It is cross-platform and runs faster than all known open-source frameworks on mobile phone CPUs.
+Developers can easily deploy deep learning algorithm models to the mobile platform using the efficient ncnn implementation, creating intelligent apps and bringing artificial intelligence to your fingertips.
+ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu, and so on.
ncnn is a high-performance neural network forward computing framework deeply optimized for mobile platforms.
ncnn has taken mobile deployment and usage into deep consideration from the very beginning of its design.
@@ -29,12 +29,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天
637093648 (lots of experts)
Answer: 卷卷卷卷卷 (full)
Telegram Group
Discord Channel
@@ -47,6 +47,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天
Answer: multi-level intermediate representation
+"They don't even know how good pnnx is" group
+818998520 (new group!)
---
@@ -71,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest
Source
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-full-source.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-full-source.zip)
@@ -91,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest
Android
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-vulkan-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-vulkan-shared.zip)
@@ -105,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest
Android cpuonly
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-shared.zip)
@@ -125,8 +131,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-vulkan-bitcode.zip)
@@ -139,8 +145,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS cpuonly
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-bitcode.zip)
@@ -148,8 +154,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS-Simulator
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-vulkan-bitcode.zip)
|
@@ -162,14 +168,14 @@ https://github.com/Tencent/ncnn/releases/latest
| iOS-Simulator cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-bitcode.zip)
|
-
+ |
|
@@ -182,7 +188,7 @@ https://github.com/Tencent/ncnn/releases/latest
| macOS |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-macos-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-macos-vulkan.zip)
|
@@ -195,7 +201,7 @@ https://github.com/Tencent/ncnn/releases/latest
| macOS cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-macos.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-macos.zip)
|
@@ -203,8 +209,8 @@ https://github.com/Tencent/ncnn/releases/latest
Mac-Catalyst |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-vulkan-bitcode.zip)
|
@@ -217,8 +223,50 @@ https://github.com/Tencent/ncnn/releases/latest
| Mac-Catalyst cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-bitcode.zip)
+
+ |
+
+
+watchOS |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-watchos.zip)
+
+ |
+
+
+ [](https://github.com/Tencent/ncnn/actions?query=workflow%3Awatchos-cpu)
+
+ |
+
+
+watchOS-Simulator |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-watchos-simulator.zip)
+
+ |
+
+
+tvOS |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-tvos.zip)
+
+ |
+
+
+ [](https://github.com/Tencent/ncnn/actions?query=workflow%3Atvos-cpu)
+
+ |
+
+
+tvOS-Simulator |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-tvos-simulator.zip)
|
@@ -226,8 +274,8 @@ https://github.com/Tencent/ncnn/releases/latest
Apple xcframework |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-vulkan-bitcode.zip)
|
@@ -238,8 +286,8 @@ https://github.com/Tencent/ncnn/releases/latest
| Apple xcframework cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-bitcode.zip)
|
@@ -258,8 +306,8 @@ https://github.com/Tencent/ncnn/releases/latest
Ubuntu 20.04 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2004.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2004-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2004.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2004-shared.zip)
|
@@ -272,8 +320,8 @@ https://github.com/Tencent/ncnn/releases/latest
| Ubuntu 22.04 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2204.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2204-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2204.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2204-shared.zip)
|
@@ -292,8 +340,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2015 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2015.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2015-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2015.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2015-shared.zip)
|
@@ -306,8 +354,8 @@ https://github.com/Tencent/ncnn/releases/latest
| VS2017 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2017.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2017-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2017.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2017-shared.zip)
|
@@ -315,8 +363,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2019 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2019.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2019-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2019.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2019-shared.zip)
|
@@ -324,8 +372,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2022 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2022.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2022-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2022.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2022-shared.zip)
|
@@ -344,7 +392,7 @@ https://github.com/Tencent/ncnn/releases/latest
WebAssembly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-webassembly.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-webassembly.zip)
|
diff --git a/benchmark/README.md b/benchmark/README.md
index d7cae38a242..6cb198e8973 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1928,6 +1928,50 @@ cooling_down = 1
yolo-fastestv2 min = 316.93 max = 319.86 avg = 318.33
```
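+The sections added below pass `benchncnn` its arguments positionally; as each run's echoed header shows, the order is loop count, thread count, powersave mode, gpu device (-1 means CPU only), and cooling down. For example (the meanings are read off the echoed parameters of the runs in this file, not asserted from benchncnn's own usage text):
+
+```bash
+#           loops  threads  powersave  gpu_device  cooling_down
+./benchncnn 10     4        0          -1          1
+```
+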
+### Radxa Zero 3W, Cortex-A55 (ARMv8.2) (1.416 GHz x 4)
+```
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 34.51 max = 106.19 avg = 79.43
+ squeezenet_int8 min = 31.48 max = 49.87 avg = 34.65
+ mobilenet min = 42.23 max = 45.36 avg = 42.89
+ mobilenet_int8 min = 35.97 max = 53.84 avg = 38.77
+ mobilenet_v2 min = 39.61 max = 40.35 avg = 40.00
+ mobilenet_v3 min = 31.19 max = 31.85 avg = 31.50
+ shufflenet min = 24.75 max = 27.74 avg = 25.55
+ shufflenet_v2 min = 22.00 max = 22.70 avg = 22.31
+ mnasnet min = 34.95 max = 53.55 avg = 37.39
+ proxylessnasnet min = 39.96 max = 44.32 avg = 40.81
+ efficientnet_b0 min = 49.76 max = 67.77 avg = 52.61
+ efficientnetv2_b0 min = 64.00 max = 85.78 avg = 67.06
+ regnety_400m min = 55.23 max = 73.22 avg = 57.87
+ blazeface min = 7.80 max = 10.39 avg = 8.27
+ googlenet min = 98.24 max = 118.27 avg = 101.78
+ googlenet_int8 min = 98.81 max = 115.66 avg = 101.52
+ resnet18 min = 75.33 max = 88.59 avg = 78.19
+ resnet18_int8 min = 76.31 max = 95.17 avg = 79.03
+ alexnet min = 65.07 max = 73.80 avg = 67.18
+ vgg16 min = 423.20 max = 455.15 avg = 436.32
+ vgg16_int8 min = 591.82 max = 620.22 avg = 607.55
+ resnet50 min = 185.53 max = 207.10 avg = 193.03
+ resnet50_int8 min = 176.84 max = 194.73 avg = 181.81
+ squeezenet_ssd min = 96.64 max = 118.46 avg = 100.86
+ squeezenet_ssd_int8 min = 96.61 max = 123.48 avg = 104.64
+ mobilenet_ssd min = 95.38 max = 110.52 avg = 98.61
+ mobilenet_ssd_int8 min = 76.21 max = 95.41 avg = 79.10
+ mobilenet_yolo min = 210.73 max = 235.47 avg = 221.72
+ mobilenetv2_yolov3 min = 134.59 max = 154.33 avg = 139.54
+ yolov4-tiny min = 167.79 max = 191.60 avg = 171.25
+ nanodet_m min = 63.22 max = 80.73 avg = 66.25
+ yolo-fastest-1.1 min = 32.87 max = 88.05 avg = 47.36
+ yolo-fastestv2 min = 26.03 max = 27.01 avg = 26.54
+ vision_transformer min = 3682.51 max = 3882.79 avg = 3809.42
+ FastestDet min = 30.69 max = 50.65 avg = 33.65
+```
+
### Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4)
```
@@ -2647,6 +2691,7 @@ cooling_down = 0
```
### Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)
+
```
nanopc-t4:/data/local/tmp # ./benchncnn 8 2 2 -1 1
loop_count = 8
@@ -2845,7 +2890,95 @@ cooling_down = 0
yolo-fastestv2 min = 24.94 max = 25.07 avg = 25.01
```
+### MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2)
+```
+root@myir-remi-1g:~/ncnn# time ./benchncnn 10 4 0 -1 1
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 85.38 max = 87.72 avg = 86.78
+ squeezenet_int8 min = 84.23 max = 86.46 avg = 85.59
+ mobilenet min = 121.01 max = 122.55 avg = 121.76
+ mobilenet_int8 min = 95.64 max = 97.27 avg = 96.25
+ mobilenet_v2 min = 101.35 max = 102.24 avg = 101.72
+ mobilenet_v3 min = 84.09 max = 86.66 avg = 84.86
+ shufflenet min = 63.32 max = 65.16 avg = 64.53
+ shufflenet_v2 min = 60.33 max = 62.35 avg = 61.04
+ mnasnet min = 95.51 max = 96.70 avg = 95.95
+ proxylessnasnet min = 124.46 max = 125.82 avg = 125.14
+ efficientnet_b0 min = 144.94 max = 146.46 avg = 145.56
+ efficientnetv2_b0 min = 182.87 max = 185.63 avg = 184.56
+ regnety_400m min = 105.31 max = 106.42 avg = 105.72
+ blazeface min = 21.34 max = 21.90 avg = 21.50
+ googlenet min = 313.01 max = 318.42 avg = 314.25
+ googlenet_int8 min = 301.87 max = 304.93 avg = 303.66
+ resnet18 min = 248.02 max = 253.93 avg = 250.12
+ resnet18_int8 min = 244.65 max = 246.62 avg = 245.66
+ alexnet min = 204.00 max = 206.39 avg = 205.21
+ resnet50 min = 583.13 max = 584.82 avg = 584.11
+ resnet50_int8 min = 517.42 max = 520.97 avg = 519.07
+ squeezenet_ssd min = 266.63 max = 273.34 avg = 268.60
+ squeezenet_ssd_int8 min = 255.42 max = 260.98 avg = 257.15
+ mobilenet_ssd min = 267.16 max = 270.41 avg = 268.20
+ mobilenet_ssd_int8 min = 205.03 max = 206.43 avg = 205.53
+ mobilenet_yolo min = 571.08 max = 576.15 avg = 574.18
+ mobilenetv2_yolov3 min = 342.52 max = 344.84 avg = 343.38
+ yolov4-tiny min = 499.74 max = 503.13 avg = 501.45
+ nanodet_m min = 161.87 max = 163.90 avg = 162.93
+ yolo-fastest-1.1 min = 72.84 max = 74.81 avg = 73.35
+ yolo-fastestv2 min = 68.24 max = 70.49 avg = 68.74
+ vision_transformer min = 12464.09 max = 12491.57 avg = 12475.63
+ FastestDet min = 67.92 max = 69.90 avg = 68.94
+```
+
+### OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4)
+
+```
+orangepi@zero2:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 1
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 76.25 max = 90.20 avg = 78.99
+ squeezenet_int8 min = 59.92 max = 60.44 avg = 60.10
+ mobilenet min = 106.91 max = 132.22 avg = 109.99
+ mobilenet_int8 min = 57.96 max = 59.06 avg = 58.19
+ mobilenet_v2 min = 97.93 max = 124.48 avg = 100.91
+ mobilenet_v3 min = 82.27 max = 83.93 avg = 83.00
+ shufflenet min = 55.27 max = 82.06 avg = 58.40
+ shufflenet_v2 min = 44.94 max = 71.99 avg = 48.10
+ mnasnet min = 90.66 max = 91.41 avg = 90.92
+ proxylessnasnet min = 91.55 max = 118.74 avg = 94.71
+ efficientnet_b0 min = 127.95 max = 155.13 avg = 131.25
+ efficientnetv2_b0 min = 145.96 max = 173.67 avg = 149.36
+ regnety_400m min = 102.83 max = 103.52 avg = 103.08
+ blazeface min = 14.46 max = 14.95 avg = 14.77
+ googlenet min = 217.71 max = 244.16 avg = 221.38
+ googlenet_int8 min = 163.04 max = 187.69 avg = 166.20
+ resnet18 min = 251.45 max = 277.52 avg = 255.00
+ resnet18_int8 min = 136.54 max = 161.95 avg = 141.60
+ alexnet min = 212.07 max = 233.27 avg = 215.34
+ vgg16 min = 1206.92 max = 1981.79 avg = 1673.28
+ vgg16_int8 min = 622.93 max = 702.12 avg = 661.83
+ resnet50 min = 555.84 max = 643.69 avg = 576.17
+ resnet50_int8 min = 348.11 max = 374.25 avg = 354.17
+ squeezenet_ssd min = 224.68 max = 251.32 avg = 230.59
+ squeezenet_ssd_int8 min = 154.87 max = 182.66 avg = 159.08
+ mobilenet_ssd min = 238.49 max = 426.65 avg = 263.18
+ mobilenet_ssd_int8 min = 118.36 max = 138.39 avg = 120.78
+ mobilenet_yolo min = 500.28 max = 615.83 avg = 553.59
+ mobilenetv2_yolov3 min = 340.27 max = 369.13 avg = 347.17
+ yolov4-tiny min = 365.04 max = 408.48 avg = 383.93
+ nanodet_m min = 112.88 max = 141.85 avg = 116.13
+ yolo-fastest-1.1 min = 72.05 max = 73.46 avg = 72.68
+ yolo-fastestv2 min = 54.94 max = 55.35 avg = 55.15
+ vision_transformer min = 6842.19 max = 9125.07 avg = 7343.64
+ FastestDet min = 59.09 max = 59.87 avg = 59.35
+```
+
### OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)
Test Ubuntu 22.04 Gnome Desktop
@@ -6803,98 +6936,54 @@ cooling_down = 0
FastestDet min = 4.34 max = 7.47 avg = 5.18
```
-### AWS c5.4xlarge Instance (Intel Xeon Platinum 8124M @ 3.399GHz, Ubuntu 20.04.6 LTS x86_64)
+### AWS c5.4xlarge Instance
-icpc (ICC) 2021.9.0 20230302
+- OS: Ubuntu 20.04.6 LTS x86_64
+- CPU: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
+- Compiler: gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.2)
+- ncnn tag: 20240102
```
-root@ip-172-31-3-216:/opt/ncnn-icc/benchmark# ../build/benchmark/benchncnn 4 8 0 -1
-loop_count = 4
-num_threads = 8
-powersave = 0
-gpu_device = -1
-cooling_down = 1
- squeezenet min = 3.23 max = 3.25 avg = 3.24
- squeezenet_int8 min = 5.97 max = 6.00 avg = 5.98
- mobilenet min = 3.44 max = 3.56 avg = 3.51
- mobilenet_int8 min = 4.53 max = 4.63 avg = 4.59
- mobilenet_v2 min = 3.82 max = 3.95 avg = 3.88
- mobilenet_v3 min = 3.49 max = 3.53 avg = 3.51
- shufflenet min = 3.56 max = 3.58 avg = 3.57
- shufflenet_v2 min = 3.47 max = 3.52 avg = 3.48
- mnasnet min = 3.69 max = 4.07 avg = 3.94
- proxylessnasnet min = 3.75 max = 3.81 avg = 3.78
- efficientnet_b0 min = 4.80 max = 4.85 avg = 4.82
- efficientnetv2_b0 min = 6.51 max = 6.65 avg = 6.59
- regnety_400m min = 8.20 max = 10.79 avg = 9.04
- blazeface min = 1.16 max = 1.19 avg = 1.18
- googlenet min = 9.92 max = 11.07 avg = 10.56
- googlenet_int8 min = 21.50 max = 21.67 avg = 21.59
- resnet18 min = 6.47 max = 6.57 avg = 6.52
- resnet18_int8 min = 19.01 max = 19.17 avg = 19.05
- alexnet min = 5.20 max = 5.28 avg = 5.25
- vgg16 min = 35.74 max = 35.88 avg = 35.79
- vgg16_int8 min = 38.20 max = 38.51 avg = 38.36
- resnet50 min = 14.02 max = 14.12 avg = 14.05
- resnet50_int8 min = 27.92 max = 28.12 avg = 28.03
- squeezenet_ssd min = 9.51 max = 9.70 avg = 9.60
- squeezenet_ssd_int8 min = 12.91 max = 13.06 avg = 12.97
- mobilenet_ssd min = 6.55 max = 6.65 avg = 6.59
- mobilenet_ssd_int8 min = 9.16 max = 9.23 avg = 9.20
- mobilenet_yolo min = 17.02 max = 17.22 avg = 17.13
- mobilenetv2_yolov3 min = 12.67 max = 12.78 avg = 12.71
- yolov4-tiny min = 23.42 max = 23.49 avg = 23.46
- nanodet_m min = 7.27 max = 7.30 avg = 7.28
- yolo-fastest-1.1 min = 4.05 max = 4.08 avg = 4.06
- yolo-fastestv2 min = 4.12 max = 4.15 avg = 4.13
- vision_transformer min = 135.25 max = 136.36 avg = 135.71
- FastestDet min = 4.12 max = 4.21 avg = 4.16
-```
-
-Intel(R) oneAPI DPC++/C++ Compiler 2023.1.0 (2023.1.0.20230320)
-
-```
-root@ip-172-31-3-216:/opt/ncnn-icx/benchmark# ../build/benchmark/benchncnn 4 8 0 -1
loop_count = 4
num_threads = 8
-powersave = 0
+powersave = 2
gpu_device = -1
cooling_down = 1
- squeezenet min = 3.13 max = 3.18 avg = 3.16
- squeezenet_int8 min = 4.04 max = 4.07 avg = 4.06
- mobilenet min = 2.99 max = 3.05 avg = 3.03
- mobilenet_int8 min = 3.68 max = 3.77 avg = 3.73
- mobilenet_v2 min = 3.88 max = 3.92 avg = 3.89
- mobilenet_v3 min = 3.60 max = 3.70 avg = 3.64
- shufflenet min = 3.52 max = 3.54 avg = 3.53
- shufflenet_v2 min = 3.61 max = 3.64 avg = 3.63
- mnasnet min = 3.51 max = 3.53 avg = 3.52
- proxylessnasnet min = 3.73 max = 3.78 avg = 3.75
- efficientnet_b0 min = 4.86 max = 4.95 avg = 4.91
- efficientnetv2_b0 min = 6.84 max = 6.97 avg = 6.91
- regnety_400m min = 7.83 max = 7.89 avg = 7.86
- blazeface min = 1.10 max = 1.13 avg = 1.11
- googlenet min = 9.80 max = 9.89 avg = 9.83
- googlenet_int8 min = 11.32 max = 11.42 avg = 11.37
- resnet18 min = 6.68 max = 6.74 avg = 6.72
- resnet18_int8 min = 8.86 max = 8.92 avg = 8.90
- alexnet min = 5.21 max = 5.25 avg = 5.22
- vgg16 min = 35.77 max = 35.92 avg = 35.88
- vgg16_int8 min = 29.64 max = 29.79 avg = 29.75
- resnet50 min = 14.11 max = 14.31 avg = 14.22
- resnet50_int8 min = 17.73 max = 18.01 avg = 17.86
- squeezenet_ssd min = 9.57 max = 9.65 avg = 9.61
- squeezenet_ssd_int8 min = 9.57 max = 9.67 avg = 9.63
- mobilenet_ssd min = 6.56 max = 6.61 avg = 6.59
- mobilenet_ssd_int8 min = 7.51 max = 7.72 avg = 7.58
- mobilenet_yolo min = 16.89 max = 17.08 avg = 17.00
- mobilenetv2_yolov3 min = 13.79 max = 13.93 avg = 13.86
- yolov4-tiny min = 24.91 max = 25.08 avg = 24.98
- nanodet_m min = 7.42 max = 7.47 avg = 7.44
- yolo-fastest-1.1 min = 4.02 max = 4.09 avg = 4.07
- yolo-fastestv2 min = 4.02 max = 4.04 avg = 4.03
- vision_transformer min = 135.54 max = 136.68 avg = 136.21
- FastestDet min = 4.06 max = 4.10 avg = 4.08
+ squeezenet min = 3.31 max = 3.33 avg = 3.32
+ squeezenet_int8 min = 3.87 max = 4.34 avg = 4.07
+ mobilenet min = 3.12 max = 3.20 avg = 3.17
+ mobilenet_int8 min = 3.32 max = 3.45 avg = 3.38
+ mobilenet_v2 min = 4.23 max = 4.43 avg = 4.33
+ mobilenet_v3 min = 3.82 max = 3.92 avg = 3.87
+ shufflenet min = 3.67 max = 3.72 avg = 3.69
+ shufflenet_v2 min = 4.08 max = 4.22 avg = 4.15
+ mnasnet min = 3.62 max = 3.69 avg = 3.64
+ proxylessnasnet min = 4.29 max = 4.59 avg = 4.37
+ efficientnet_b0 min = 5.32 max = 5.64 avg = 5.50
+ efficientnetv2_b0 min = 6.81 max = 6.88 avg = 6.85
+ regnety_400m min = 9.71 max = 9.77 avg = 9.74
+ blazeface min = 1.71 max = 2.57 avg = 2.10
+ googlenet min = 10.00 max = 10.09 avg = 10.05
+ googlenet_int8 min = 8.76 max = 8.79 avg = 8.77
+ resnet18 min = 6.55 max = 6.91 avg = 6.70
+ resnet18_int8 min = 5.63 max = 5.95 avg = 5.81
+ alexnet min = 4.88 max = 4.91 avg = 4.89
+ vgg16 min = 36.99 max = 37.04 avg = 37.01
+ vgg16_int8 min = 28.13 max = 28.57 avg = 28.31
+ resnet50 min = 13.99 max = 14.13 avg = 14.06
+ resnet50_int8 min = 12.49 max = 12.56 avg = 12.53
+ squeezenet_ssd min = 9.93 max = 10.04 avg = 9.98
+ squeezenet_ssd_int8 min = 9.51 max = 9.70 avg = 9.59
+ mobilenet_ssd min = 6.60 max = 6.63 avg = 6.61
+ mobilenet_ssd_int8 min = 6.95 max = 7.10 avg = 7.02
+ mobilenet_yolo min = 18.28 max = 18.44 avg = 18.35
+ mobilenetv2_yolov3 min = 13.26 max = 13.39 avg = 13.32
+ yolov4-tiny min = 25.14 max = 25.58 avg = 25.37
+ nanodet_m min = 7.71 max = 7.77 avg = 7.75
+ yolo-fastest-1.1 min = 4.69 max = 4.96 avg = 4.81
+ yolo-fastestv2 min = 4.84 max = 5.17 avg = 5.01
+ vision_transformer min = 139.34 max = 140.38 avg = 139.96
+ FastestDet min = 4.95 max = 5.12 avg = 5.06
```
### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU)
@@ -7408,50 +7497,51 @@ cooling_down = 0
- Platform: Xunlei OneCloud (玩客云)
- OS: Armbian buster (20.12) armv7l
-- ncnn tag: 20231027
+- Compiler: gcc version 8.3.0 (Debian 8.3.0-6)
+- ncnn tag: 20240102
```
-mizu-bai@aml-s812:~/ncnn-20231027/benchmark$ ../build/benchmark/benchncnn 4 4 0 -1 1
+mizu-bai@aml-s812:~/ncnn-20240102/benchmark$ ../build/benchmark/benchncnn
loop_count = 4
num_threads = 4
-powersave = 0
+powersave = 2
gpu_device = -1
cooling_down = 1
- squeezenet min = 449.65 max = 636.24 avg = 549.81
- squeezenet_int8 min = 271.84 max = 471.03 avg = 418.24
- mobilenet min = 874.01 max = 1027.19 avg = 927.64
- mobilenet_int8 min = 358.16 max = 555.39 avg = 477.83
- mobilenet_v2 min = 455.49 max = 802.61 avg = 598.32
- mobilenet_v3 min = 388.48 max = 620.12 avg = 535.33
- shufflenet min = 269.15 max = 497.81 avg = 352.46
- shufflenet_v2 min = 220.64 max = 396.63 avg = 305.29
- mnasnet min = 422.92 max = 760.36 avg = 594.63
- proxylessnasnet min = 522.79 max = 889.06 avg = 742.49
- efficientnet_b0 min = 922.67 max = 1014.29 avg = 971.97
- efficientnetv2_b0 min = 1022.19 max = 1153.30 avg = 1092.78
- regnety_400m min = 652.96 max = 972.85 avg = 838.15
- blazeface min = 70.44 max = 131.64 avg = 93.18
- googlenet min = 1599.90 max = 1789.44 avg = 1701.07
- googlenet_int8 min = 925.61 max = 1185.10 avg = 1055.48
- resnet18 min = 1318.23 max = 1586.19 avg = 1422.16
- resnet18_int8 min = 558.06 max = 881.32 avg = 777.04
- alexnet min = 755.06 max = 1109.70 avg = 941.45
- vgg16 min = 6984.48 max = 7085.39 avg = 7024.48
- vgg16_int8 min = 3986.30 max = 4011.83 avg = 3997.05
- resnet50 min = 4196.40 max = 4256.91 avg = 4234.42
- resnet50_int8 min = 2403.39 max = 2630.50 avg = 2512.29
- squeezenet_ssd min = 1039.30 max = 1411.58 avg = 1199.95
- squeezenet_ssd_int8 min = 742.65 max = 952.13 avg = 812.48
- mobilenet_ssd min = 1772.72 max = 1993.07 avg = 1906.95
- mobilenet_ssd_int8 min = 893.49 max = 1076.65 avg = 998.14
- mobilenet_yolo min = 4177.06 max = 4403.88 avg = 4300.91
- mobilenetv2_yolov3 min = 2182.82 max = 2240.35 avg = 2207.98
- yolov4-tiny min = 2441.59 max = 2817.29 avg = 2594.33
- nanodet_m min = 577.75 max = 925.98 avg = 803.88
- yolo-fastest-1.1 min = 247.65 max = 497.52 avg = 311.91
- yolo-fastestv2 min = 207.27 max = 398.13 avg = 314.22
- vision_transformer min = 18775.75 max = 19008.69 avg = 18906.63
- FastestDet min = 296.48 max = 466.79 avg = 354.53
+ squeezenet min = 376.45 max = 445.48 avg = 408.08
+ squeezenet_int8 min = 247.06 max = 340.34 avg = 281.40
+ mobilenet min = 696.71 max = 745.63 avg = 718.49
+ mobilenet_int8 min = 355.78 max = 472.06 avg = 401.17
+ mobilenet_v2 min = 428.86 max = 491.25 avg = 458.45
+ mobilenet_v3 min = 361.78 max = 425.90 avg = 396.94
+ shufflenet min = 245.90 max = 333.41 avg = 293.46
+ shufflenet_v2 min = 210.69 max = 329.51 avg = 260.73
+ mnasnet min = 418.49 max = 493.40 avg = 448.95
+ proxylessnasnet min = 542.20 max = 566.65 avg = 554.75
+ efficientnet_b0 min = 727.72 max = 785.47 avg = 750.72
+ efficientnetv2_b0 min = 805.70 max = 874.57 avg = 843.87
+ regnety_400m min = 627.74 max = 686.57 avg = 660.60
+ blazeface min = 62.14 max = 121.32 avg = 82.10
+ googlenet min = 1295.31 max = 1411.88 avg = 1342.26
+ googlenet_int8 min = 796.39 max = 860.28 avg = 823.76
+ resnet18 min = 1076.93 max = 1125.12 avg = 1099.37
+ resnet18_int8 min = 587.12 max = 634.97 avg = 605.29
+ alexnet min = 701.70 max = 729.68 avg = 718.99
+ vgg16 min = 5584.13 max = 5748.84 avg = 5660.70
+ vgg16_int8 min = 3107.89 max = 3138.78 avg = 3121.28
+ resnet50 min = 3378.84 max = 3461.61 avg = 3425.38
+ resnet50_int8 min = 2044.93 max = 2067.70 avg = 2061.38
+ squeezenet_ssd min = 908.77 max = 972.68 avg = 939.98
+ squeezenet_ssd_int8 min = 609.58 max = 703.88 avg = 662.43
+ mobilenet_ssd min = 1524.69 max = 1589.79 avg = 1552.12
+ mobilenet_ssd_int8 min = 817.70 max = 885.45 avg = 840.30
+ mobilenet_yolo min = 3497.13 max = 3605.83 avg = 3543.72
+ mobilenetv2_yolov3 min = 1734.10 max = 1824.98 avg = 1795.42
+ yolov4-tiny min = 2093.70 max = 2163.44 avg = 2128.30
+ nanodet_m min = 593.75 max = 647.03 avg = 608.03
+ yolo-fastest-1.1 min = 228.68 max = 318.40 avg = 265.74
+ yolo-fastestv2 min = 194.29 max = 258.78 avg = 219.82
+ vision_transformer min = 14836.43 max = 15238.27 avg = 15125.26
+ FastestDet min = 215.60 max = 264.69 avg = 239.85
```
### Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740)
@@ -7670,3 +7760,408 @@ cooling_down = 0
vision_transformer min = 650.85 max = 696.67 avg = 671.13
FastestDet min = 8.63 max = 13.12 avg = 11.39
```
+
+### MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12)
+```
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 8 0 -1 1
+loop_count = 8
+num_threads = 8
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 1.87 max = 2.18 avg = 2.01
+ squeezenet_int8 min = 1.52 max = 1.98 avg = 1.77
+ mobilenet min = 3.02 max = 3.34 avg = 3.15
+ mobilenet_int8 min = 1.90 max = 2.27 avg = 2.04
+ mobilenet_v2 min = 2.72 max = 3.13 avg = 2.89
+ mobilenet_v3 min = 2.20 max = 3.82 avg = 2.78
+ shufflenet min = 1.97 max = 2.56 avg = 2.20
+ shufflenet_v2 min = 1.77 max = 2.29 avg = 1.96
+ mnasnet min = 2.61 max = 3.48 avg = 2.90
+ proxylessnasnet min = 2.72 max = 3.06 avg = 2.89
+ efficientnet_b0 min = 4.57 max = 5.17 avg = 4.89
+ efficientnetv2_b0 min = 5.24 max = 6.72 avg = 5.81
+ regnety_400m min = 4.94 max = 6.78 avg = 5.70
+ blazeface min = 0.80 max = 1.02 avg = 0.91
+ googlenet min = 7.76 max = 8.53 avg = 8.12
+ googlenet_int8 min = 5.68 max = 6.62 avg = 6.19
+ resnet18 min = 5.35 max = 6.06 avg = 5.61
+ resnet18_int8 min = 4.20 max = 4.40 avg = 4.29
+ alexnet min = 5.96 max = 7.30 avg = 6.77
+ vgg16 min = 29.27 max = 30.58 avg = 29.93
+ vgg16_int8 min = 26.72 max = 28.12 avg = 27.27
+ resnet50 min = 15.21 max = 19.16 avg = 16.09
+ resnet50_int8 min = 8.57 max = 9.16 avg = 8.91
+ squeezenet_ssd min = 6.29 max = 7.56 avg = 6.82
+ squeezenet_ssd_int8 min = 5.57 max = 6.96 avg = 6.12
+ mobilenet_ssd min = 6.90 max = 8.90 avg = 7.55
+ mobilenet_ssd_int8 min = 4.53 max = 5.22 avg = 4.86
+ mobilenet_yolo min = 16.88 max = 19.71 avg = 17.88
+ mobilenetv2_yolov3 min = 10.51 max = 14.19 avg = 11.95
+ yolov4-tiny min = 12.81 max = 16.23 avg = 14.22
+ nanodet_m min = 4.38 max = 5.96 avg = 5.19
+ yolo-fastest-1.1 min = 2.22 max = 3.08 avg = 2.73
+ yolo-fastestv2 min = 2.09 max = 2.73 avg = 2.41
+ vision_transformer min = 193.39 max = 203.13 avg = 198.32
+ FastestDet min = 1.98 max = 2.35 avg = 2.16
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 2 -1 1
+loop_count = 8
+num_threads = 4
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 2.23 max = 2.31 avg = 2.27
+ squeezenet_int8 min = 1.68 max = 1.73 avg = 1.70
+ mobilenet min = 3.76 max = 3.86 avg = 3.81
+ mobilenet_int8 min = 2.07 max = 2.16 avg = 2.11
+ mobilenet_v2 min = 2.72 max = 2.95 avg = 2.80
+ mobilenet_v3 min = 2.43 max = 2.51 avg = 2.47
+ shufflenet min = 1.78 max = 1.87 avg = 1.81
+ shufflenet_v2 min = 1.61 max = 1.66 avg = 1.63
+ mnasnet min = 2.69 max = 2.82 avg = 2.76
+ proxylessnasnet min = 2.95 max = 3.13 avg = 3.05
+ efficientnet_b0 min = 4.99 max = 5.29 avg = 5.08
+ efficientnetv2_b0 min = 5.73 max = 5.86 avg = 5.79
+ regnety_400m min = 4.97 max = 5.04 avg = 5.00
+ blazeface min = 1.07 max = 1.17 avg = 1.10
+ googlenet min = 8.51 max = 9.43 avg = 8.75
+ googlenet_int8 min = 6.01 max = 6.13 avg = 6.07
+ resnet18 min = 6.72 max = 7.04 avg = 6.95
+ resnet18_int8 min = 4.31 max = 4.40 avg = 4.34
+ alexnet min = 7.41 max = 7.71 avg = 7.57
+ vgg16 min = 33.77 max = 34.68 avg = 34.08
+ vgg16_int8 min = 32.61 max = 33.83 avg = 33.12
+ resnet50 min = 18.76 max = 19.53 avg = 19.05
+ resnet50_int8 min = 9.56 max = 9.70 avg = 9.61
+ squeezenet_ssd min = 6.86 max = 7.26 avg = 7.01
+ squeezenet_ssd_int8 min = 5.42 max = 6.17 avg = 5.64
+ mobilenet_ssd min = 8.38 max = 9.14 avg = 8.62
+ mobilenet_ssd_int8 min = 4.60 max = 4.90 avg = 4.69
+ mobilenet_yolo min = 19.59 max = 20.06 avg = 19.78
+ mobilenetv2_yolov3 min = 10.46 max = 11.01 avg = 10.70
+ yolov4-tiny min = 13.46 max = 14.18 avg = 13.86
+ nanodet_m min = 4.52 max = 4.59 avg = 4.55
+ yolo-fastest-1.1 min = 1.88 max = 1.94 avg = 1.91
+ yolo-fastestv2 min = 1.73 max = 1.79 avg = 1.76
+ vision_transformer min = 220.32 max = 229.49 avg = 223.92
+ FastestDet min = 1.67 max = 1.73 avg = 1.70
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 1 -1 1
+loop_count = 8
+num_threads = 4
+powersave = 1
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 3.42 max = 4.25 avg = 3.62
+ squeezenet_int8 min = 2.63 max = 2.78 avg = 2.73
+ mobilenet min = 5.66 max = 6.25 avg = 5.82
+ mobilenet_int8 min = 3.13 max = 5.66 avg = 3.58
+ mobilenet_v2 min = 4.40 max = 4.46 avg = 4.42
+ mobilenet_v3 min = 3.74 max = 4.07 avg = 3.94
+ shufflenet min = 2.77 max = 2.86 avg = 2.82
+ shufflenet_v2 min = 2.52 max = 2.62 avg = 2.57
+ mnasnet min = 4.24 max = 4.37 avg = 4.28
+ proxylessnasnet min = 4.65 max = 4.91 avg = 4.74
+ efficientnet_b0 min = 7.71 max = 10.00 avg = 8.08
+ efficientnetv2_b0 min = 9.24 max = 10.34 avg = 9.87
+ regnety_400m min = 7.87 max = 8.35 avg = 8.02
+ blazeface min = 2.38 max = 2.46 avg = 2.40
+ googlenet min = 13.21 max = 13.78 avg = 13.40
+ googlenet_int8 min = 10.23 max = 10.65 avg = 10.36
+ resnet18 min = 9.25 max = 9.68 avg = 9.49
+ resnet18_int8 min = 6.86 max = 6.97 avg = 6.91
+ alexnet min = 9.73 max = 10.53 avg = 9.97
+ vgg16 min = 47.43 max = 48.12 avg = 47.78
+ vgg16_int8 min = 47.08 max = 48.18 avg = 47.46
+ resnet50 min = 26.82 max = 27.14 avg = 26.99
+ resnet50_int8 min = 15.01 max = 15.57 avg = 15.20
+ squeezenet_ssd min = 9.96 max = 12.66 avg = 10.83
+ squeezenet_ssd_int8 min = 8.47 max = 9.26 avg = 8.88
+ mobilenet_ssd min = 12.54 max = 13.25 avg = 12.82
+ mobilenet_ssd_int8 min = 7.03 max = 10.91 avg = 7.94
+ mobilenet_yolo min = 29.73 max = 30.45 avg = 30.23
+ mobilenetv2_yolov3 min = 16.64 max = 17.71 avg = 17.13
+ yolov4-tiny min = 22.25 max = 22.65 avg = 22.45
+ nanodet_m min = 7.56 max = 7.86 avg = 7.69
+ yolo-fastest-1.1 min = 3.32 max = 3.45 avg = 3.39
+ yolo-fastestv2 min = 2.76 max = 2.96 avg = 2.84
+ vision_transformer min = 328.11 max = 337.26 avg = 332.12
+ FastestDet min = 2.66 max = 2.77 avg = 2.71
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 1 2 -1 1
+loop_count = 8
+num_threads = 1
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 5.27 max = 5.35 avg = 5.32
+ squeezenet_int8 min = 3.06 max = 3.22 avg = 3.16
+ mobilenet min = 9.59 max = 9.85 avg = 9.74
+ mobilenet_int8 min = 4.29 max = 4.45 avg = 4.37
+ mobilenet_v2 min = 5.14 max = 5.33 avg = 5.20
+ mobilenet_v3 min = 4.28 max = 4.54 avg = 4.42
+ shufflenet min = 3.18 max = 3.34 avg = 3.27
+ shufflenet_v2 min = 2.78 max = 3.23 avg = 3.05
+ mnasnet min = 5.01 max = 5.38 avg = 5.19
+ proxylessnasnet min = 6.11 max = 6.30 avg = 6.21
+ efficientnet_b0 min = 11.53 max = 11.78 avg = 11.66
+ efficientnetv2_b0 min = 13.88 max = 14.28 avg = 14.13
+ regnety_400m min = 8.11 max = 8.18 avg = 8.16
+ blazeface min = 0.99 max = 1.08 avg = 1.01
+ googlenet min = 19.68 max = 20.71 avg = 20.25
+ googlenet_int8 min = 13.42 max = 13.86 avg = 13.60
+ resnet18 min = 18.10 max = 18.84 avg = 18.53
+ resnet18_int8 min = 9.67 max = 10.17 avg = 9.99
+ alexnet min = 15.76 max = 16.35 avg = 16.03
+ vgg16 min = 70.22 max = 72.85 avg = 71.58
+ vgg16_int8 min = 76.83 max = 79.70 avg = 78.45
+ resnet50 min = 39.73 max = 41.24 avg = 40.30
+ resnet50_int8 min = 20.76 max = 21.54 avg = 21.27
+ squeezenet_ssd min = 12.63 max = 18.67 avg = 15.20
+ squeezenet_ssd_int8 min = 10.29 max = 16.13 avg = 14.13
+ mobilenet_ssd min = 17.21 max = 18.43 avg = 17.68
+ mobilenet_ssd_int8 min = 8.92 max = 9.49 avg = 9.07
+ mobilenet_yolo min = 37.45 max = 38.29 avg = 37.88
+ mobilenetv2_yolov3 min = 19.18 max = 19.83 avg = 19.58
+ yolov4-tiny min = 27.06 max = 27.86 avg = 27.45
+ nanodet_m min = 9.33 max = 9.50 avg = 9.42
+ yolo-fastest-1.1 min = 3.48 max = 3.59 avg = 3.54
+ yolo-fastestv2 min = 2.29 max = 2.37 avg = 2.33
+ vision_transformer min = 730.38 max = 739.99 avg = 735.77
+ FastestDet min = 2.40 max = 2.48 avg = 2.43
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 64 1 2 0 0
+[0 Mali-G720-Immortalis MC12] queueC=0[2] queueG=0[2] queueT=0[2]
+[0 Mali-G720-Immortalis MC12] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0
+[0 Mali-G720-Immortalis MC12] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1
+[0 Mali-G720-Immortalis MC12] subgroup=16 basic/vote/ballot/shuffle=1/1/1/1
+[0 Mali-G720-Immortalis MC12] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
+loop_count = 64
+num_threads = 1
+powersave = 2
+gpu_device = 0
+cooling_down = 0
+ squeezenet min = 11.26 max = 13.58 avg = 12.32
+ squeezenet_int8 min = 3.08 max = 3.29 avg = 3.17
+ mobilenet min = 11.96 max = 14.52 avg = 13.48
+ mobilenet_int8 min = 4.20 max = 4.58 avg = 4.34
+ mobilenet_v2 min = 13.62 max = 16.46 avg = 14.62
+ mobilenet_v3 min = 13.98 max = 17.16 avg = 15.25
+ shufflenet min = 10.22 max = 11.82 avg = 11.07
+ shufflenet_v2 min = 12.42 max = 15.39 avg = 14.35
+ mnasnet min = 12.94 max = 16.30 avg = 14.91
+ proxylessnasnet min = 13.18 max = 16.55 avg = 15.05
+ efficientnet_b0 min = 16.70 max = 20.35 avg = 18.27
+ efficientnetv2_b0 min = 54.09 max = 70.05 avg = 58.68
+ regnety_400m min = 16.20 max = 18.42 avg = 17.27
+ blazeface min = 6.50 max = 7.86 avg = 6.93
+ googlenet min = 15.29 max = 17.54 avg = 16.19
+ googlenet_int8 min = 20.38 max = 22.08 avg = 20.98
+ resnet18 min = 12.22 max = 15.63 avg = 14.27
+ resnet18_int8 min = 9.50 max = 10.46 avg = 9.75
+ alexnet min = 12.00 max = 16.09 avg = 13.65
+ vgg16 min = 31.06 max = 32.77 avg = 31.85
+ vgg16_int8 min = 115.72 max = 123.71 avg = 118.23
+ resnet50 min = 15.74 max = 16.53 avg = 16.10
+ resnet50_int8 min = 32.43 max = 33.78 avg = 33.07
+ squeezenet_ssd min = 17.24 max = 21.80 avg = 20.68
+ squeezenet_ssd_int8 min = 9.69 max = 10.52 avg = 9.97
+ mobilenet_ssd min = 15.32 max = 17.63 avg = 16.62
+ mobilenet_ssd_int8 min = 8.84 max = 9.54 avg = 9.05
+ mobilenet_yolo min = 16.67 max = 18.21 avg = 17.25
+ mobilenetv2_yolov3 min = 20.08 max = 25.40 avg = 23.12
+ yolov4-tiny min = 21.98 max = 29.67 avg = 24.75
+ nanodet_m min = 23.19 max = 29.95 avg = 25.69
+ yolo-fastest-1.1 min = 15.07 max = 17.78 avg = 16.49
+ yolo-fastestv2 min = 14.67 max = 16.07 avg = 15.44
+ vision_transformer min = 768.04 max = 801.48 avg = 786.79
+ FastestDet min = 8.33 max = 16.07 avg = 14.38
+```
+
+### Xeon Phi 3120A (1.10 GHz 57-core 228-thread)
+
+- Host: CentOS 7.9
+- Compiler: icc & icpc (ICC) 17.0.2 20170213
+- ncnn tag: 20240102
+
+Build command
+
+```bash
+$ CC=icc CXX=icpc CFLAGS="-mmic" CXXFLAGS="-mmic" cmake .. -DCMAKE_BUILD_TYPE=Release -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF
+```
+
+Copy the whole `ncnn` directory and the libraries in `/opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib` to `mic0`, then set the `LD_LIBRARY_PATH` environment variable there. Some tools cannot be built, but `benchncnn` works. The built `benchncnn` targets the Intel Xeon Phi coprocessor (k1om):
+
+```bash
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ file benchncnn
+benchncnn: ELF 64-bit LSB executable, Intel Xeon Phi coprocessor (k1om), version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.32, not stripped
+```
+
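+A minimal sketch of that staging step (assumptions only: the card is reachable as `mic0` over ssh/scp, and `~/mic-lib` is a hypothetical staging directory for the runtime libraries):
+
+```bash
+# Stage ncnn and the MIC runtime libraries on the coprocessor.
+ssh mic0 'mkdir -p ~/mic-lib'
+scp -r ncnn mic0:~/
+scp /opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib/*.so* mic0:~/mic-lib/
+
+# Then, on mic0, expose the runtime libraries before running benchncnn.
+export LD_LIBRARY_PATH=$HOME/mic-lib:$LD_LIBRARY_PATH
+```
+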
+The benchmark runs in native mode: ssh into the Xeon Phi with `ssh user@mic0`, then run `benchncnn` as on a general Linux system.
+
+```
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 56 0 -1 1
+loop_count = 4
+num_threads = 56
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 43.42 max = 44.20 avg = 43.64
+ squeezenet_int8 min = 161.92 max = 162.41 avg = 162.15
+ mobilenet min = 44.49 max = 46.90 avg = 45.68
+ mobilenet_int8 min = 230.47 max = 232.40 avg = 231.77
+ mobilenet_v2 min = 57.22 max = 62.03 avg = 59.42
+ mobilenet_v3 min = 301.16 max = 306.62 avg = 303.90
+ shufflenet min = 65.80 max = 70.18 avg = 67.70
+ shufflenet_v2 min = 49.54 max = 53.17 avg = 51.22
+ mnasnet min = 521.87 max = 527.76 avg = 524.63
+ proxylessnasnet min = 745.79 max = 748.55 avg = 746.92
+ efficientnet_b0 min = 582.21 max = 584.64 avg = 583.34
+ efficientnetv2_b0 min = 84.13 max = 86.13 avg = 85.19
+ regnety_400m min = 209.67 max = 214.84 avg = 212.39
+ blazeface min = 26.33 max = 27.39 avg = 26.74
+ googlenet min = 124.14 max = 125.72 avg = 124.83
+ googlenet_int8 min = 498.36 max = 502.37 avg = 500.29
+ resnet18 min = 87.86 max = 88.83 avg = 88.35
+ resnet18_int8 min = 359.50 max = 360.71 avg = 360.11
+ alexnet min = 49.87 max = 51.25 avg = 50.76
+ vgg16 min = 341.87 max = 343.92 avg = 342.42
+ vgg16_int8 min = 1649.34 max = 1655.37 avg = 1652.98
+ resnet50 min = 198.91 max = 202.32 avg = 200.58
+ resnet50_int8 min = 983.48 max = 988.73 avg = 986.22
+ squeezenet_ssd min = 108.33 max = 111.45 avg = 110.18
+ squeezenet_ssd_int8 min = 368.96 max = 370.30 avg = 369.54
+ mobilenet_ssd min = 98.29 max = 101.49 avg = 99.99
+ mobilenet_ssd_int8 min = 462.18 max = 466.20 avg = 464.85
+ mobilenet_yolo min = 262.42 max = 266.84 avg = 263.91
+ mobilenetv2_yolov3 min = 159.20 max = 161.58 avg = 160.66
+ yolov4-tiny min = 229.22 max = 230.48 avg = 229.87
+ nanodet_m min = 115.10 max = 116.78 avg = 115.86
+ yolo-fastest-1.1 min = 154.48 max = 155.33 avg = 154.79
+ yolo-fastestv2 min = 161.10 max = 163.98 avg = 161.88
+ vision_transformer min = 848.51 max = 863.03 avg = 854.92
+ FastestDet min = 251.64 max = 253.22 avg = 252.38
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 112 0 -1 1
+loop_count = 4
+num_threads = 112
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 41.07 max = 41.19 avg = 41.12
+ squeezenet_int8 min = 161.73 max = 163.90 avg = 162.74
+ mobilenet min = 36.82 max = 37.53 avg = 37.11
+ mobilenet_int8 min = 231.50 max = 233.81 avg = 232.65
+ mobilenet_v2 min = 53.12 max = 55.87 avg = 54.44
+ mobilenet_v3 min = 277.82 max = 280.61 avg = 279.66
+ shufflenet min = 64.11 max = 64.92 avg = 64.63
+ shufflenet_v2 min = 48.23 max = 50.00 avg = 49.19
+ mnasnet min = 532.09 max = 534.73 avg = 533.34
+ proxylessnasnet min = 760.43 max = 763.94 avg = 762.34
+ efficientnet_b0 min = 534.29 max = 547.51 avg = 541.29
+ efficientnetv2_b0 min = 75.94 max = 76.88 avg = 76.39
+ regnety_400m min = 226.37 max = 227.81 avg = 227.23
+ blazeface min = 26.03 max = 26.93 avg = 26.51
+ googlenet min = 106.53 max = 107.54 avg = 107.06
+ googlenet_int8 min = 503.01 max = 505.16 avg = 504.13
+ resnet18 min = 73.63 max = 76.61 avg = 75.11
+ resnet18_int8 min = 358.18 max = 359.50 avg = 358.99
+ alexnet min = 37.40 max = 38.17 avg = 37.83
+ vgg16 min = 244.95 max = 250.05 avg = 247.24
+ vgg16_int8 min = 1511.89 max = 1512.66 avg = 1512.35
+ resnet50 min = 151.99 max = 154.66 avg = 153.37
+ resnet50_int8 min = 954.16 max = 957.63 avg = 956.55
+ squeezenet_ssd min = 91.46 max = 97.18 avg = 94.00
+ squeezenet_ssd_int8 min = 368.03 max = 375.96 avg = 370.99
+ mobilenet_ssd min = 79.61 max = 81.38 avg = 80.33
+ mobilenet_ssd_int8 min = 458.93 max = 463.41 avg = 461.63
+ mobilenet_yolo min = 234.59 max = 236.91 avg = 235.43
+ mobilenetv2_yolov3 min = 145.82 max = 146.92 avg = 146.23
+ yolov4-tiny min = 219.22 max = 220.51 avg = 219.83
+ nanodet_m min = 109.43 max = 113.94 avg = 112.20
+ yolo-fastest-1.1 min = 158.13 max = 160.59 avg = 159.20
+ yolo-fastestv2 min = 162.05 max = 162.80 avg = 162.47
+ vision_transformer min = 615.14 max = 625.35 avg = 618.47
+ FastestDet min = 279.98 max = 282.49 avg = 281.14
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 224 0 -1 1
+loop_count = 4
+num_threads = 224
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 45.54 max = 46.81 avg = 46.13
+ squeezenet_int8 min = 186.81 max = 187.14 avg = 186.97
+ mobilenet min = 38.33 max = 39.11 avg = 38.64
+ mobilenet_int8 min = 251.06 max = 251.91 avg = 251.40
+ mobilenet_v2 min = 56.57 max = 57.15 avg = 56.88
+ mobilenet_v3 min = 365.04 max = 366.87 avg = 365.94
+ shufflenet min = 71.16 max = 72.02 avg = 71.68
+ shufflenet_v2 min = 52.14 max = 53.60 avg = 52.92
+ mnasnet min = 596.37 max = 603.62 avg = 600.50
+ proxylessnasnet min = 911.84 max = 912.23 avg = 912.04
+ efficientnet_b0 min = 611.77 max = 614.32 avg = 612.69
+ efficientnetv2_b0 min = 82.16 max = 83.05 avg = 82.62
+ regnety_400m min = 253.43 max = 255.79 avg = 254.66
+ blazeface min = 30.54 max = 30.91 avg = 30.70
+ googlenet min = 111.68 max = 112.65 avg = 112.11
+ googlenet_int8 min = 594.07 max = 597.09 avg = 596.03
+ resnet18 min = 78.14 max = 79.12 avg = 78.75
+ resnet18_int8 min = 412.69 max = 413.92 avg = 413.46
+ alexnet min = 40.93 max = 41.43 avg = 41.17
+ vgg16 min = 242.45 max = 244.46 avg = 243.47
+ vgg16_int8 min = 1545.61 max = 1548.72 avg = 1547.47
+ resnet50 min = 147.73 max = 148.56 avg = 148.07
+ resnet50_int8 min = 1034.47 max = 1042.31 avg = 1038.41
+ squeezenet_ssd min = 107.82 max = 110.53 avg = 108.98
+ squeezenet_ssd_int8 min = 423.30 max = 426.91 avg = 425.67
+ mobilenet_ssd min = 74.54 max = 77.13 avg = 75.97
+ mobilenet_ssd_int8 min = 510.95 max = 513.33 avg = 512.40
+ mobilenet_yolo min = 238.83 max = 239.64 avg = 239.27
+ mobilenetv2_yolov3 min = 159.80 max = 160.31 avg = 160.04
+ yolov4-tiny min = 233.89 max = 237.41 avg = 236.22
+ nanodet_m min = 122.39 max = 123.42 avg = 122.89
+ yolo-fastest-1.1 min = 194.49 max = 195.25 avg = 194.94
+ yolo-fastestv2 min = 193.06 max = 195.03 avg = 194.05
+ vision_transformer min = 547.36 max = 554.17 avg = 549.99
+ FastestDet min = 317.76 max = 321.38 avg = 320.18
+```
+
+### PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2)
+```
+loop_count = 4
+num_threads = 2
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 43.84 max = 43.95 avg = 43.88
+ squeezenet_int8 min = 35.48 max = 35.77 avg = 35.66
+ mobilenet min = 69.31 max = 70.03 avg = 69.66
+ mobilenet_int8 min = 42.30 max = 42.40 avg = 42.35
+ mobilenet_v2 min = 59.07 max = 59.35 avg = 59.19
+ mobilenet_v3 min = 46.02 max = 46.37 avg = 46.19
+ shufflenet min = 31.52 max = 31.61 avg = 31.56
+ shufflenet_v2 min = 23.99 max = 24.07 avg = 24.04
+ mnasnet min = 49.40 max = 50.45 avg = 49.92
+ proxylessnasnet min = 53.24 max = 53.85 avg = 53.53
+ efficientnet_b0 min = 77.49 max = 77.84 avg = 77.62
+ efficientnetv2_b0 min = 88.51 max = 88.92 avg = 88.69
+ regnety_400m min = 66.99 max = 67.05 avg = 67.03
+ blazeface min = 7.74 max = 8.14 avg = 7.98
+ googlenet min = 126.62 max = 127.23 avg = 126.91
+ googlenet_int8 min = 102.87 max = 103.16 avg = 103.01
+ resnet18 min = 102.28 max = 102.63 avg = 102.48
+ resnet18_int8 min = 72.01 max = 72.45 avg = 72.29
+ alexnet min = 76.00 max = 124.61 avg = 88.24
+ vgg16 min = 597.75 max = 601.99 avg = 599.44
+ vgg16_int8 min = 421.40 max = 423.83 avg = 423.01
+ resnet50 min = 278.16 max = 280.64 avg = 279.37
+ resnet50_int8 min = 207.26 max = 207.47 avg = 207.36
+ squeezenet_ssd min = 108.69 max = 109.26 avg = 108.99
+ squeezenet_ssd_int8 min = 84.05 max = 84.60 avg = 84.28
+ mobilenet_ssd min = 141.65 max = 142.46 avg = 142.14
+ mobilenet_ssd_int8 min = 84.43 max = 84.99 avg = 84.73
+ mobilenet_yolo min = 322.53 max = 325.15 avg = 323.51
+ mobilenetv2_yolov3 min = 194.84 max = 196.98 avg = 196.07
+ yolov4-tiny min = 208.29 max = 213.26 avg = 210.77
+ nanodet_m min = 64.78 max = 65.38 avg = 65.08
+ yolo-fastest-1.1 min = 37.89 max = 38.23 avg = 38.07
+ yolo-fastestv2 min = 29.75 max = 30.33 avg = 30.09
+ vision_transformer min = 4257.71 max = 4263.73 avg = 4260.60
+ FastestDet min = 30.86 max = 44.67 avg = 34.41
+```
diff --git a/benchmark/RankCards/README.md b/benchmark/RankCards/README.md
index 1db9ced86ae..00cb164e50c 100644
--- a/benchmark/RankCards/README.md
+++ b/benchmark/RankCards/README.md
@@ -5,79 +5,88 @@ The set is then compared to a reference set by calculating the ratio of each mod
Finally, the boards are ranked from fast to slow.
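As a rough sketch of that ratio computation (assuming a plain average of per-model ratios against the reference board; the actual aggregation may differ), where `board.txt` and `ref.txt` are hypothetical `model avg_ms` listings parsed from the benchncnn logs:

```bash
# Average each model's time ratio (board / reference); lower means faster.
paste board.txt ref.txt | awk '{ sum += $2 / $4; n++ } END { printf "%.3g\n", sum / n }'
```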
| | Board | Ratio |
| :--: | :---- | :--- |
-| 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.123 |
-| 2 | nVIDIA RTX2080 of Desktop | 0.126 |
-| 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.136 |
-| 4 | nVIDIA RTX2060 of Notebook | 0.167 |
-| 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.199 |
-| 6 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.204 |
-| 7 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.211 |
-| 8 | MacBook Pro (13-inch, M1, 2020) | 0.26 |
-| 9 | AWS c5.4xlarge Instance (Intel Xeon Platinum 8124M @ 3.399GHz, Ubuntu 20.04.6 LTS x86_64) | 0.317 |
-| 10 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.333 |
-| 11 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.37 |
-| 12 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.37 |
-| 13 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.377 |
-| 14 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.418 |
-| 15 | nVIDIA RTX A3000 of Notebook (6GB) | 0.434 |
-| 16 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.473 |
-| 17 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.492 |
-| 18 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.498 |
-| 19 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.583 |
-| 20 | NVIDIA Jetson Orin Nano | 0.641 |
-| 21 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 0.696 |
-| 22 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 0.809 |
-| 23 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 0.86 |
-| 24 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 0.898 |
-| 25 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 |
-| 26 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.01 |
-| 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.24 |
-| 28 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.36 |
-| 29 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 1.62 |
-| 30 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.75 |
-| 31 | Loongson 3A5000 (LA464 2.5GHz * 4) | 1.79 |
-| 32 | NVIDIA Jetson Nano | 1.82 |
-| 33 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 1.83 |
-| 34 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 1.83 |
-| 35 | Intel Celeron N5105 | 2.12 |
-| 36 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 2.49 |
-| 37 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 2.54 |
-| 38 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 2.71 |
-| 39 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 2.74 |
-| 40 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 2.77 |
-| 41 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 2.81 |
-| 42 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 2.9 |
-| 43 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 2.93 |
-| 44 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 2.95 |
-| 45 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.2 |
-| 46 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 3.88 |
-| 47 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 3.96 |
-| 48 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 3.99 |
-| 49 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 4 |
-| 50 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 4.13 |
-| 51 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 4.6 |
-| 52 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 4.92 |
-| 53 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 5.21 |
-| 54 | Intel Atom x5-Z8350 | 5.83 |
-| 55 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 5.93 |
-| 56 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 5.93 |
-| 57 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 6.53 |
-| 58 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 7.71 |
-| 59 | iPhone 5S (Apple A7 1.3GHz x 2) | 8.1 |
-| 60 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 10.3 |
-| 61 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 10.7 |
-| 62 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 12.6 |
-| 63 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 14.8 |
-| 64 | Loongson 2K1000 (GS264 1.0GHz x 2) | 19 |
-| 65 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 19 |
-| 66 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 21.1 |
-| 67 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 21.6 |
-| 68 | Sunway SW831 (sw_64 2.5GHz * 8) | 31.2 |
-| 69 | Intel Celeron M 420 (Yonah 1.60 GHz x 1) | 33.4 |
-| 70 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 34 |
-| 71 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 35.2 |
-| 72 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 46 |
-| 73 | VisionFive2 , JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 | 52 |
-| 74 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 60.5 |
-| 75 | Sunway SW421 (sw_64 1.7GHz * 4) | 87.7 |
-| 76 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 132 |
+| 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.147 |
+| 2 | nVIDIA RTX2080 of Desktop | 0.15 |
+| 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.18 |
+| 4 | nVIDIA RTX2060 of Notebook | 0.198 |
+| 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.255 |
+| 6 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.275 |
+| 7 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.277 |
+| 8 | MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12) | 0.309 |
+| 9 | MacBook Pro (13-inch, M1, 2020) | 0.346 |
+| 10 | AWS c5.4xlarge Instance | 0.418 |
+| 11 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.427 |
+| 12 | Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740) | 0.45 |
+| 13 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.478 |
+| 14 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.482 |
+| 15 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.485 |
+| 16 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.54 |
+| 17 | nVIDIA RTX A3000 of Notebook (6GB) | 0.577 |
+| 18 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.593 |
+| 19 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.642 |
+| 20 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.665 |
+| 21 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.753 |
+| 22 | NVIDIA Jetson Orin Nano | 0.819 |
+| 23 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 |
+| 24 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 1 |
+| 25 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 1.05 |
+| 26 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 1.11 |
+| 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 1.19 |
+| 28 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.35 |
+| 29 | NVIDIA Jetson TX2 NX(NV-Denver2 2.0Ghz x 2 + Cortex-A57 2.0Ghz x 4 + 256-core NVIDIA Pascal iGPU) | 1.59 |
+| 30 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.66 |
+| 31 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.75 |
+| 32 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 2.19 |
+| 33 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 2.23 |
+| 34 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 2.28 |
+| 35 | Loongson 3A5000 (LA464 2.5GHz * 4) | 2.31 |
+| 36 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 2.37 |
+| 37 | NVIDIA Jetson Nano | 2.44 |
+| 38 | Intel Celeron N5105 | 2.8 |
+| 39 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.24 |
+| 40 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 3.48 |
+| 41 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 3.58 |
+| 42 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 3.63 |
+| 43 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 3.75 |
+| 44 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 3.75 |
+| 45 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 3.82 |
+| 46 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 3.85 |
+| 47 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 3.86 |
+| 48 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 4.08 |
| 49 | Radxa Zero 3W, Cortex-A55 (ARMv8.2) (1.416 GHz x 4) | 4.5 |
+| 50 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 4.95 |
+| 51 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.11 |
+| 52 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.16 |
+| 53 | PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2) | 5.16 |
+| 54 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 5.26 |
+| 55 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 5.27 |
+| 56 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 5.88 |
+| 57 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 6.51 |
+| 58 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 6.66 |
+| 59 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 7.63 |
+| 60 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 7.66 |
+| 61 | Intel Atom x5-Z8350 | 7.74 |
+| 62 | Loongson 2K2000 (LA364 1.5GHz * 2 with lsx) | 8.23 |
+| 63 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 8.34 |
+| 64 | OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4) | 9.51 |
+| 65 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 9.87 |
+| 66 | iPhone 5S (Apple A7 1.3GHz x 2) | 11 |
| 67 | MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2) | 11.9 |
+| 68 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 12.5 |
+| 69 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 13.7 |
+| 70 | Xeon Phi 3120A (1.10 GHz 57-core 228-thread) | 15.1 |
+| 71 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 16.3 |
+| 72 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 18.8 |
+| 73 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 24.4 |
+| 74 | Loongson 2K1000 (GS264 1.0GHz x 2) | 24.8 |
+| 75 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 26.7 |
+| 76 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 26.8 |
+| 77 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 36.2 |
+| 78 | Sunway SW831 (sw_64 2.5GHz * 8) | 40.7 |
+| 79 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 43.2 |
+| 80 | Intel Celeron M 420 (Yonah 1.60 GHz x 1) | 43.9 |
+| 81 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 45.9 |
| 82 | VisionFive2, JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32 | 72.4 |
+| 83 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 83.3 |
+| 84 | Sunway SW421 (sw_64 1.7GHz * 4) | 116 |
+| 85 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 165 |
diff --git a/build-android.cmd b/build-android.cmd
index 0c4262a37d0..b621dae6c1a 100644
--- a/build-android.cmd
+++ b/build-android.cmd
@@ -2,40 +2,22 @@
@ECHO OFF
@SETLOCAL
@SET ANDROID_NDK=
-@SET VULKAN_SDK=
:: Set ninja.exe
:: @SET NINJA_EXE=
:: android armv7
-mkdir build-android-armv7
-pushd build-android-armv7
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
-:: cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM=%NINJA_EXE% -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
-cmake --build . --parallel %NUMBER_OF_PROCESSORS%
-cmake --build . --target install
-popd
-
-:: android armv7 vulkan
mkdir build-android-armv7-vulkan
pushd build-android-armv7-vulkan
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
:: android aarch64
-mkdir build-android-aarch64
-pushd build-android-aarch64
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 ..
-cmake --build . --parallel %NUMBER_OF_PROCESSORS%
-cmake --build . --target install
-popd
-
-:: android aarch64 vulkan
mkdir build-android-aarch64-vulkan
pushd build-android-aarch64-vulkan
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
@@ -43,7 +25,7 @@ popd
:: android x86
mkdir build-android-x86
pushd build-android-x86
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
@@ -51,7 +33,7 @@ popd
:: android x86_64
mkdir build-android-x86_64
pushd build-android-x86_64
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
diff --git a/build.sh b/build.sh
index 20a96eae2d3..754aaf8a4cd 100755
--- a/build.sh
+++ b/build.sh
@@ -1,9 +1,17 @@
#!/usr/bin/env bash
+##### android armv7 without neon
+mkdir -p build-android-armv7-without-neon
+pushd build-android-armv7-without-neon
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
+make -j4
+make install
+popd
+
##### android armv7
mkdir -p build-android-armv7
pushd build-android-armv7
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -11,15 +19,7 @@ popd
##### android aarch64
mkdir -p build-android-aarch64
pushd build-android-aarch64
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 ..
-make -j4
-make install
-popd
-
-##### android armv7 without neon
-mkdir -p build-android-armv7-without-neon
-pushd build-android-armv7-without-neon
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -27,7 +27,7 @@ popd
##### android x86
mkdir -p build-android-x86
pushd build-android-x86
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -35,39 +35,7 @@ popd
##### android x86_64
mkdir -p build-android-x86_64
pushd build-android-x86_64
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
-make -j4
-make install
-popd
-
-##### android armv7 vulkan
-mkdir -p build-android-armv7-vulkan
-pushd build-android-armv7-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android aarch64 vulkan
-mkdir -p build-android-aarch64-vulkan
-pushd build-android-aarch64-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android x86 vulkan
-mkdir -p build-android-x86-vulkan
-pushd build-android-x86-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android x86_64 vulkan
-mkdir -p build-android-x86_64-vulkan
-pushd build-android-x86_64-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -144,70 +112,6 @@ make -j4
make install
popd
-##### ios armv7 arm64
-mkdir -p build-ios
-pushd build-ios
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
-make -j4
-make install
-popd
-
-##### ios armv7 arm64 bitcode
-mkdir -p build-ios-bitcode
-pushd build-ios-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=ON ..
-make -j4
-make install
-popd
-
-##### ios simulator i386 x86_64
-mkdir -p build-ios-sim
-pushd build-ios-sim
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
-make -j4
-make install
-popd
-
-##### ios simulator i386 x86_64 bitcode
-mkdir -p build-ios-sim-bitcode
-pushd build-ios-sim-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=ON ..
-make -j4
-make install
-popd
-
-##### ios arm64 vulkan
-mkdir -p build-ios-vulkan
-pushd build-ios-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### ios arm64 vulkan bitcode
-mkdir -p build-ios-vulkan-bitcode
-pushd build-ios-vulkan-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### ios simulator x86_64 vulkan
-mkdir -p build-ios-sim-vulkan
-pushd build-ios-sim-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make
-make install
-popd
-
-##### ios simulator x86_64 vulkan bitcode
-mkdir -p build-ios-sim-vulkan-bitcode
-pushd build-ios-sim-vulkan-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
##### MacOS
mkdir -p build-mac
pushd build-mac
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index e6c74fec5eb..4eeedb010c7 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -31,35 +31,14 @@ macro(ncnn_add_arch_opt_layer class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CF
list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
# generate layer_declaration and layer_registry file
- set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
- set(layer_declaration_class "class ${class}_final_${NCNN_TARGET_ARCH_OPT} : virtual public ${class}")
- set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n")
-
- if(WITH_LAYER_${name}_vulkan)
- set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan")
- set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
- endif()
-
set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}")
- set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
-
- set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n")
- set(layer_declaration "${layer_declaration}public:\n")
- set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration}};\n")
- set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final_${NCNN_TARGET_ARCH_OPT})\n} // namespace ncnn\n\n")
-
- set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}) }\n")
+
+ set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")
else()
# no isa optimized version
if(WITH_LAYER_${name})
- set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n")
+ set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
else()
set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
@@ -110,18 +89,21 @@ macro(ncnn_add_layer class)
# generate layer_declaration and layer_registry file
if(WITH_LAYER_${name})
set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
- set(layer_declaration_class "class ${class}_final : virtual public ${class}")
- set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}) }\n")
source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")
endif()
+ if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
+ set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n")
+
+ source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp")
+ endif()
+
if(WITH_LAYER_${name}_vulkan)
set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan")
- set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n")
file(GLOB_RECURSE NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp")
file(GLOB_RECURSE NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp")
@@ -133,28 +115,22 @@ macro(ncnn_add_layer class)
source_group ("sources\\\\layers\\\\vulkan" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp")
endif()
- if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
- set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}")
- set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
-
- source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp")
+ if(WITH_LAYER_${name})
+ set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
+ else()
+ set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
- if(WITH_LAYER_${name})
- set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n")
- set(layer_declaration "${layer_declaration}public:\n")
- set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration}};\n")
- set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final)\n} // namespace ncnn\n\n")
+ if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
+ set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#endif\n")
+ else()
+ set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
- if(WITH_LAYER_${name})
- set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n")
+ if(WITH_LAYER_${name}_vulkan)
+ set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n")
else()
- set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
+ set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
if(NCNN_TARGET_ARCH STREQUAL "x86")
diff --git a/cmake/ncnn_add_shader.cmake b/cmake/ncnn_add_shader.cmake
index 8006241bc05..76680f4ca81 100644
--- a/cmake/ncnn_add_shader.cmake
+++ b/cmake/ncnn_add_shader.cmake
@@ -1,7 +1,7 @@
macro(ncnn_add_shader NCNN_SHADER_SRC)
get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE)
- set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h)
+ set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h)
add_custom_command(
OUTPUT ${NCNN_SHADER_COMP_HEADER}
@@ -13,7 +13,7 @@ macro(ncnn_add_shader NCNN_SHADER_SRC)
set_source_files_properties(${NCNN_SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE)
get_filename_component(NCNN_SHADER_COMP_HEADER_NAME ${NCNN_SHADER_COMP_HEADER} NAME)
- string(APPEND layer_shader_spv_data "#include \"${NCNN_SHADER_COMP_HEADER_NAME}\"\n")
+ string(APPEND layer_shader_spv_data "#include \"layer/vulkan/shader/${NCNN_SHADER_COMP_HEADER_NAME}\"\n")
get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE)
string(APPEND layer_shader_registry "{${NCNN_SHADER_SRC_NAME_WE}_comp_data,sizeof(${NCNN_SHADER_SRC_NAME_WE}_comp_data)},\n")
diff --git a/cmake/ncnn_generate_shader_comp_header.cmake b/cmake/ncnn_generate_shader_comp_header.cmake
index a41b6328d8d..79f7c1eff3b 100644
--- a/cmake/ncnn_generate_shader_comp_header.cmake
+++ b/cmake/ncnn_generate_shader_comp_header.cmake
@@ -18,8 +18,8 @@ string(REGEX REPLACE "\n\n" "\n" comp_data "${comp_data}")
get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
# text to hex
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}")
-file(READ ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX)
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}")
+file(READ ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX)
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex})
string(FIND "${comp_data_hex}" "," tail_comma REVERSE)
string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex)
diff --git a/cmake/ncnn_generate_shader_spv_header.cmake b/cmake/ncnn_generate_shader_spv_header.cmake
deleted file mode 100644
index 93649daed92..00000000000
--- a/cmake/ncnn_generate_shader_spv_header.cmake
+++ /dev/null
@@ -1,581 +0,0 @@
-
-function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS SHADER_SRC)
-
- # fp32
- get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
-
- set(SHADER_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -V -s -x -o ${SHADER_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 packed
- set(SHADER_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16p")
-
- set(SHADER_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16p_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"
- "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_packed=1
- -V -s -x -o ${SHADER_fp16p_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16p_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 packed + fp16 arithmetic
- set(SHADER_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16pa")
-
- set(SHADER_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16pa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
- "-D buffer_ld1(buf,i)=float16_t(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"
- "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_fp16pa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16pa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 storage
- set(SHADER_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16s")
-
- set(SHADER_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16s_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=float(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"
- "-D buffer_ld2(buf,i)=vec2(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(buf[i])"
- "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"
- "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_storage=1
- -V -s -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 storage + fp16 arithmetic
- set(SHADER_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16sa")
-
- set(SHADER_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16sa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_fp16sa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16sa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp32
- set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image")
-
- set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba32f
- -Dunfp=highp
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
-
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1
- -V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16p
- set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p")
-
- set(SHADER_image_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"
- "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_packed=1
- -V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16p + fp16a
- set(SHADER_image_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16pa")
-
- set(SHADER_image_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16pa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=float16_t(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"
- "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_image_fp16pa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16pa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16s
- set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s")
-
- set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r16f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=float(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"
- "-D buffer_ld2(buf,i)=vec2(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(buf[i])"
- "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"
- "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"
-
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_storage=1
- -V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16s + fp16a
- set(SHADER_image_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16sa")
-
- set(SHADER_image_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16sa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
-
- -Dimfmtc1=r16f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image1d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_image_fp16sa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16sa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- set(LOCAL_SHADER_SPV_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h)
-
- file(WRITE ${LOCAL_SHADER_SPV_HEADER}
- "static const uint32_t ${SHADER_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- )
-
- set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE)
-
- set(LOCAL_SHADER_SPV_HEX_HEADERS
- ${SHADER_SPV_HEX_FILE}
- ${SHADER_fp16p_SPV_HEX_FILE}
- ${SHADER_fp16pa_SPV_HEX_FILE}
- ${SHADER_fp16s_SPV_HEX_FILE}
- ${SHADER_fp16sa_SPV_HEX_FILE}
- ${SHADER_image_SPV_HEX_FILE}
- ${SHADER_image_fp16p_SPV_HEX_FILE}
- ${SHADER_image_fp16pa_SPV_HEX_FILE}
- ${SHADER_image_fp16s_SPV_HEX_FILE}
- ${SHADER_image_fp16sa_SPV_HEX_FILE}
- )
-
- set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE)
- set(${SHADER_SPV_HEX_HEADERS} ${LOCAL_SHADER_SPV_HEX_HEADERS} PARENT_SCOPE)
-
-endfunction()
diff --git a/codeformat.sh b/codeformat.sh
index 3e9cb33832a..21d128f4698 100755
--- a/codeformat.sh
+++ b/codeformat.sh
@@ -3,9 +3,9 @@
# we run clang-format and astyle twice to get stable format output
format_code() {
- find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | xargs -i clang-format -i {}
+ find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | grep -v ruapu | xargs -i clang-format -i {}
astyle -n -r "benchmark/*.h,*.cpp,*.cc" "tests/*.h,*.cpp,*.cc" "tools/*.h,*.cpp,*.cc" "examples/*.h,*.cpp,*.cc"
- astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h
+ astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h --exclude=src/ruapu.h
astyle -n -r "python/*.h,*.cpp,*.cc" --exclude=python/pybind11
}
diff --git a/docs/Home.md b/docs/Home.md
index f1108b7b8ef..7f377e1b1f7 100644
--- a/docs/Home.md
+++ b/docs/Home.md
@@ -21,8 +21,6 @@ int main()
net.load_model("model.bin");
ncnn::Extractor ex = net.create_extractor();
- ex.set_light_mode(true);
- ex.set_num_threads(4);
ex.input("data", in);
diff --git a/docs/developer-guide/arm-a53-a55-dual-issue.md b/docs/developer-guide/arm-a53-a55-dual-issue.md
index 7344747a8c0..ace5e7092a4 100644
--- a/docs/developer-guide/arm-a53-a55-dual-issue.md
+++ b/docs/developer-guide/arm-a53-a55-dual-issue.md
@@ -51,20 +51,23 @@ fmla
```
## A55
-* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
-* 64bit vector load can be dual issued with fmla, no penalty
+* Limited by the number of neon register read and write ports, most neon instructions cannot be dual-issued.
+* neon instructions have different latencies
+* 128bit vector load cannot be dual-issued with fmla; a WAR hazard waits 2 cycles
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector insert can be dual issued with fmla, no penalty
### practical guide
-* use 64bit vector load only
-* load 64bit, dual issue with fmla
+* A55 can load 128 bits and write 256 bits per clock; it supports dual issue of two 64bit vector loads or single issue of one 128bit vector load
+* `ldr`, dual issue with fmla
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with fmla
* interleaved load loose register dependency
* nop trick is not needed
+* Unrolling fmla loops reduces pipeline bubbles
+* Some neon data type conversion instructions can be dual-issued, such as `fsvts`
```
ldr d0, [r0] // 0 cycle, v0 first 64bit
fmla
diff --git a/docs/developer-guide/glsl-extension.md b/docs/developer-guide/glsl-extension.md
index 185ca0e49cb..82ae035e46d 100644
--- a/docs/developer-guide/glsl-extension.md
+++ b/docs/developer-guide/glsl-extension.md
@@ -99,7 +99,7 @@ void main()
)";
Option opt;
- // you can control the extention behavior
+ // you can control the extension behavior
// even if the gpu supports 16bit storage
opt.use_fp16_storage = false;
@@ -170,10 +170,10 @@ declare variable in shared local memory
shared lfp tmp_a[8][4][2];
```
-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|
## image format and precision hint type
diff --git a/docs/developer-guide/glsl-extension.zh.md b/docs/developer-guide/glsl-extension.zh.md
index 9b0718adec5..1e856929ac3 100644
--- a/docs/developer-guide/glsl-extension.zh.md
+++ b/docs/developer-guide/glsl-extension.zh.md
@@ -170,10 +170,10 @@ void main()
shared lfp tmp_a[8][4][2];
```
-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|
## 图像格式类型(image format type)和精度类型(precision hint type)
diff --git a/docs/developer-guide/layer-feat-mask.md b/docs/developer-guide/layer-feat-mask.md
new file mode 100644
index 00000000000..caff65c2144
--- /dev/null
+++ b/docs/developer-guide/layer-feat-mask.md
@@ -0,0 +1,111 @@
+# layer feature mask
+
+Each ncnn layer accepts a special parameter pair `31=X` to control layer-specific behavior.
+
+X is an unsigned integer in which each bit contributes a feature mask.
+
+We usually use it to configure fine-grained behavior for certain layers to maintain accuracy, reduce memory usage or optimize performance.
+
+|bit|value|mask|rationale|
+|---|---|---|---|
+|1<<0|1|no fp16 arithmetic|precision concern|
+|1<<1|2|no fp16 storage|precision concern|
+|1<<2|4|no bf16 storage|precision concern|
+|1<<3|8|no int8|debug dynamic quantized model|
+|1<<4|16|no vulkan|avoid cpu-gpu transfer overhead when a gpu op sits between cpu ops|
+|1<<5|32|no sgemm|reduce some memory|
+|1<<6|64|no winograd|reduce some memory|
+|1<<7|128|no threading|force single thread|
+
+These bits can be OR-combined into one value to control multiple behaviors simultaneously.
+
+For example, `31=17` means disabling both vulkan and fp16 arithmetic.
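+
+As a sanity check, the value for `31=X` can be composed with ordinary bit operations. A minimal illustrative sketch in plain C++ (not part of the ncnn API):
+
+```cpp
+#include <cstdio>
+
+int main()
+{
+    // bit positions from the table above
+    const unsigned int no_fp16_arithmetic = 1u << 0; // 1
+    const unsigned int no_vulkan = 1u << 4;          // 16
+
+    // OR the masks together to get the value for the `31=X` parameter pair
+    const unsigned int x = no_fp16_arithmetic | no_vulkan;
+    printf("31=%u\n", x); // prints 31=17
+    return 0;
+}
+```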
+
+## disable fp16 for certain layer to fix overflow
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
+```
+
+Typically, we use fp16 computation to improve inference speed.
+However, since the weight values of `conv1` are very large, fp16 accumulation may cause numerical overflow, so fp16 needs to be disabled individually for `conv1` while other layers continue to run in fp16 mode.
+
+Add `31=3` to disable fp16 storage and arithmetic.
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=3
+```
+
+## disable vulkan for certain layer to improve performance
+
+```ruby
+7767517
+5 5
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+SomeCPULayer c0 1 1 conv0 c0 0=32
+ReLU relu0 1 1 c0 relu0
+SomeCPULayer c1 1 1 relu0 c1 0=32
+```
+
+Between the CPU layers, there is a simple calculation layer that supports vulkan. We can set `31=16` to force it to run on the CPU. This avoids the overhead of data upload, download and storage layout conversion between CPU and GPU. After all, the CPU is fast enough for simple operations.
+
+```ruby
+7767517
+5 5
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+SomeCPULayer c0 1 1 conv0 c0 0=32
+ReLU relu0 1 1 c0 relu0 31=16
+SomeCPULayer c1 1 1 relu0 c1 0=32
+```
+
+## disable winograd for certain layer to reduce memory usage
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
+```
+
+Winograd convolution trades extra memory for better performance, but the speedup is not always realized. In memory-constrained situations, or when memory IO is the bottleneck, we can disable winograd on some layers in exchange for a smaller memory footprint. Add `31=64` to a Convolution layer to force the implicit-gemm or tiled im2col-gemm implementation, reducing memory usage and sometimes improving vulkan performance.
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=64
+```
+
+## disable threading for certain layer to improve performance
+
+```ruby
+7767517
+4 4
+Input input 0 1 input0 0=22 1=22 2=3
+Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
+HardSigmoid hs 1 1 conv0 hs0
+Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
+```
+
+The overhead of multi-thread dispatch and merging is too large for small tensors. Add `31=128` to the HardSigmoid layer to force single-threaded execution, reducing power consumption and improving performance.
+
+```ruby
+7767517
+4 4
+Input input 0 1 input0 0=22 1=22 2=3
+Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
+HardSigmoid hs 1 1 conv0 hs0 31=128
+Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
+```
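+
+Conversely, a mask value can be decoded back into the individual features it disables. A minimal illustrative sketch, assuming only the bit assignments from the table above:
+
+```cpp
+#include <cstdio>
+
+int main()
+{
+    // feature names indexed by bit position, as listed in the table above
+    const char* names[8] = {
+        "no fp16 arithmetic", "no fp16 storage", "no bf16 storage", "no int8",
+        "no vulkan", "no sgemm", "no winograd", "no threading"
+    };
+
+    const unsigned int x = 17; // e.g. parsed from `31=17`
+    for (int i = 0; i < 8; i++)
+    {
+        if (x & (1u << i))
+            printf("%s\n", names[i]); // prints "no fp16 arithmetic" and "no vulkan"
+    }
+    return 0;
+}
+```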
diff --git a/docs/faq.en.md b/docs/faq.en.md
index 071809808fe..807c4a9e3ee 100644
--- a/docs/faq.en.md
+++ b/docs/faq.en.md
@@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice),
Set net.opt.use_vulkan_compute = true before load_param / load_model;
-- ## How to exexcite multiple blob inputs, multiple blob outputs?
+- ## How to execute multiple blob inputs, multiple blob outputs?
Multiple execute `ex.input()` and `ex.extract()` like following
```
ex.input("data1", in_1);
diff --git a/docs/how-to-build/build-mlir2ncnn.md b/docs/how-to-build/build-mlir2ncnn.md
index f975824c7cc..521f20cff82 100644
--- a/docs/how-to-build/build-mlir2ncnn.md
+++ b/docs/how-to-build/build-mlir2ncnn.md
@@ -8,7 +8,7 @@ https://github.com/llvm/llvm-project.git
git checkout -b mlir
```
Current working commit id is 74e6030bcbcc8e628f9a99a424342a0c656456f9:
-```
+```bash
$ git log
commit 74e6030bcbcc8e628f9a99a424342a0c656456f9 (HEAD -> main, origin/main, origin/HEAD)
@@ -49,6 +49,6 @@ See https://zhuanlan.zhihu.com/p/152535430
**Usage mlir2ncnn**
-```
+```bash
./mlir2ncnn pix2pix.mlir pix2pix.param pix2pix.bin
```
diff --git a/docs/how-to-build/how-to-build.md b/docs/how-to-build/how-to-build.md
index e7cbf472726..c15e1d9485b 100644
--- a/docs/how-to-build/how-to-build.md
+++ b/docs/how-to-build/how-to-build.md
@@ -180,7 +180,7 @@ cmake --build . --config Release --target install
```
(optional) Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home
-Build ncnn library (replace with a proper path):
+Build ncnn library (replace `` with a proper path):
```shell
cd
@@ -193,6 +193,43 @@ cmake --build . --config Release --target install
Note: To speed up compilation process on multi core machines, configuring `cmake` to use `jom` or `ninja` using `-G` flag is recommended.
+Note: For protobuf >= 22.0 (take v25.3 as an example):
+
+Build zlib:
+```shell
+git clone -b v1.3.1 https://github.com/madler/zlib.git
+cd zlib
+mkdir build
+cd build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install ..
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
+Build protobuf library (replace `` with a proper path):
+```shell
+git clone -b v25.3 https://github.com/protocolbuffers/protobuf.git
+cd protobuf
+git submodule update --init --recursive
+
+mkdir protobuf_build
+cd protobuf_build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_CXX_STANDARD=14 -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DZLIB_INCLUDE_DIR=\build\install\include -DZLIB_LIBRARY=\build\install\lib\zlib.lib -DABSL_PROPAGATE_CXX_STD=ON ../cmake
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
+Build ncnn library (replace `` and `` with proper paths):
+
+```shell
+cd
+mkdir -p build
+cd build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_PREFIX_PATH=/protobuf_build\install\cmake -DZLIB_INCLUDE_DIR=\build\install\include -DZLIB_LIBRARY=\build\install\lib\zlib.lib -Dabsl_DIR=/protobuf_build\install\lib\cmake\absl -Dutf8_range_DIR=/protobuf_build\install\lib\cmake\utf8_range -DNCNN_VULKAN=ON ..
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
***
### Build for macOS
@@ -215,13 +252,13 @@ Download and install Vulkan SDK from
```shell
-wget https://sdk.lunarg.com/sdk/download/1.2.189.0/mac/vulkansdk-macos-1.2.189.0.dmg?Human=true -O vulkansdk-macos-1.2.189.0.dmg
-hdiutil attach vulkansdk-macos-1.2.189.0.dmg
-sudo /Volumes/vulkansdk-macos-1.2.189.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.2.189.0 --accept-licenses --default-answer --confirm-command install
-hdiutil detach /Volumes/vulkansdk-macos-1.2.189.0
+wget https://sdk.lunarg.com/sdk/download/1.3.280.1/mac/vulkansdk-macos-1.3.280.1.dmg -O vulkansdk-macos-1.3.280.1.dmg
+hdiutil attach vulkansdk-macos-1.3.280.1.dmg
+sudo /Volumes/vulkansdk-macos-1.3.280.1/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.3.280.1 --accept-licenses --default-answer --confirm-command install
+hdiutil detach /Volumes/vulkansdk-macos-1.3.280.1
# setup env
-export VULKAN_SDK=`pwd`/vulkansdk-macos-1.2.189.0/macOS
+export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.280.1/macOS
```
```shell
@@ -229,9 +266,8 @@ cd
mkdir -p build
cd build
-cmake -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
- -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.2.189.0/MoltenVK/include \
- -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.2.189.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64;arm64" \
+ -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.3.280.1/macOS/lib/libMoltenVK.dylib \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..
cmake --build . -j 4
@@ -330,12 +366,7 @@ cd build-android-armv7
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
- -DANDROID_PLATFORM=android-14 ..
-
-# If you want to enable Vulkan, platform api version >= android-24 is needed
-cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
- -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+ -DANDROID_PLATFORM=android-14 -DNCNN_VULKAN=ON ..
# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags
@@ -356,12 +387,7 @@ cd build-android-aarch64
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"\
-DANDROID_ABI="arm64-v8a" \
- -DANDROID_PLATFORM=android-21 ..
-
-# If you want to enable Vulkan, platform api version >= android-24 is needed
-cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
- -DANDROID_ABI="arm64-v8a" \
- -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+ -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags
@@ -395,7 +421,7 @@ mkdir -p build-ios
cd build-ios
cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
- -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \
+ -DPLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="arm64;arm64e" \
-DPERL_EXECUTABLE=/usr/local/bin/perl \
-DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
@@ -422,7 +448,7 @@ mkdir -p build-ios-sim
cd build-ios-sim
cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
- -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \
+ -DPLATFORM=SIMULATORARM64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="x86_64;arm64" \
-DPERL_EXECUTABLE=/usr/local/bin/perl \
-DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
@@ -469,21 +495,11 @@ git submodule update --init
mkdir -p build-ios
cd build-ios
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="armv7;arm64;arm64e" \
- -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
- -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
- -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
- -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \
- -DNCNN_BUILD_BENCHMARK=OFF ..
-
-# vulkan is only available on arm64 devices
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DIOS_ARCH="arm64;arm64e" \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=OS64 -DARCHS="arm64;arm64e" \
-DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
-DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
-DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
-DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \
- -DVulkan_INCLUDE_DIR=$VULKAN_SDK/../MoltenVK/include \
- -DVulkan_LIBRARY=$VULKAN_SDK/../MoltenVK/dylib/iOS/libMoltenVK.dylib \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
@@ -497,7 +513,7 @@ cd
mkdir -p build-ios-sim
cd build-ios-sim
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=SIMULATOR -DIOS_ARCH="i386;x86_64" \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=SIMULATORARM64 -DARCHS="x86_64;arm64" \
-DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
-DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
-DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
@@ -508,7 +524,7 @@ cmake --build . -j 4
cmake --build . --target install
```
-Package glslang framework:
+Package glslang framework for iPhoneOS:
```shell
cd
@@ -519,13 +535,12 @@ ln -s Versions/Current/Headers glslang.framework/Headers
ln -s Versions/Current/Resources glslang.framework/Resources
ln -s Versions/Current/glslang glslang.framework/glslang
libtool -static build-ios/install/lib/libglslang.a build-ios/install/lib/libMachineIndependent.a build-ios/install/lib/libGenericCodeGen.a build-ios/install/lib/libSPIRV.a build-ios/install/lib/libOGLCompiler.a build-ios/install/lib/libOSDependent.a -o build-ios/install/lib/libglslang_combined.a
-libtool -static build-ios-sim/install/lib/libglslang.a build-ios-sim/install/lib/libMachineIndependent.a build-ios-sim/install/lib/libGenericCodeGen.a build-ios-sim/install/lib/libSPIRV.a build-ios-sim/install/lib/libOGLCompiler.a build-ios-sim/install/lib/libOSDependent.a -o build-ios-sim/install/lib/libglslang_combined.a
-lipo -create build-ios/install/lib/libglslang_combined.a build-ios-sim/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
+lipo -create build-ios/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
cp -r build/install/include/glslang glslang.framework/Versions/A/Headers/
sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
```
-Package ncnn framework:
+Package ncnn framework for iPhoneOS:
```shell
cd
@@ -535,7 +550,7 @@ ln -s A ncnn.framework/Versions/Current
ln -s Versions/Current/Headers ncnn.framework/Headers
ln -s Versions/Current/Resources ncnn.framework/Resources
ln -s Versions/Current/ncnn ncnn.framework/ncnn
-lipo -create build-ios/install/lib/libncnn.a build-ios-sim/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
+lipo -create build-ios/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/
sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
```
@@ -753,7 +768,7 @@ Pick `build-qnx/install` folder for further usage.
Install DevkitPRO toolchains
- If you are working on windows, download DevkitPro installer from [DevkitPro](https://devkitpro.org/wiki/Getting_Started).
- If you are using Ubuntu, the official guidelines from DevkitPro might not work for you. Try using the lines below to install
-```
+```shell
sudo apt-get update
sudo apt-get upgrade
wget https://apt.devkitpro.org/install-devkitpro-pacman
@@ -761,14 +776,14 @@ chmod +x ./install-devkitpro-pacman
sudo ./install-devkitpro-pacman
```
-```
+```shell
export DEVKITPRO=/opt/devkitpro
export DEVKITARM=/opt/devkitpro/devkitARM
export DEVKITPPC=/opt/devkitpro/devkitPPC
export PATH=/opt/devkitpro/tools/bin:$PATH
source ~/.profile
```
-```
+```shell
sudo dkp-pacman -Sy
sudo dkp-pacman -Syu
sudo dkp-pacman -S 3ds-dev
@@ -796,7 +811,7 @@ Copy the toolchain files from [3DS-cmake](https://github.com/Xtansia/3ds-cmake)(
```
Build with:
-```
+```shell
cd ncnn
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/DevkitArm3DS.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_VFPV4=OFF ..
diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
index 0c17a306738..60d9f2c639a 100644
--- a/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
+++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
@@ -39,7 +39,7 @@ Most of these systems are android with version lower than 8.1.
In the beginning, I had no GPGPU programming experience, and I had to learn one.
-vulkan is considered more portable and well supported by venders and the cross-platform low-overhead graphics api. As a contrast, cuda is only available on nvidia device, metal is only available on macos and ios, while loading opencl library is banned in android 7.0+ and does not work on ios.
+vulkan is considered more portable and well supported by vendors as a cross-platform low-overhead graphics api. By contrast, cuda is only available on nvidia devices, metal is only available on macos and ios, while loading the opencl library is banned on android 7.0+ and does not work on ios.
### I got errors like "vkCreateComputePipelines failed -1000012000" or random stalls or crashes
@@ -87,7 +87,7 @@ It is common that your model runs slower on gpu than cpu on arm devices like mob
### vulkan device not found / extra high cpu utility while vulkan is enabled on nvidia gpu
-There are severel reasons could lead to this outcome. First please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this:
+There are several reasons that could lead to this outcome. First, please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this:
```bash
$ nvidia-smi
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
index c23b050ba27..29b2a0fc586 100644
--- a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
@@ -103,7 +103,6 @@ Execute the network inference and retrieve the result
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
-ex.set_light_mode(true);
ex.input("data", in);
ex.extract("prob", out);
```
@@ -114,7 +113,6 @@ If you load model with binary param.bin file, you should use the enum value in a
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
-ex.set_light_mode(true);
ex.input(alexnet_param_id::BLOB_data, in);
ex.extract(alexnet_param_id::BLOB_prob, out);
```
@@ -131,10 +129,6 @@ for (int j=0; j<out.w; j++)
diff --git a/src/cpu.cpp b/src/cpu.cpp
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
#include <sys/system_properties.h> // __system_property_get()
#include
#endif
+#include
#include
+#include
+#include
#include
#include
#endif
@@ -69,6 +72,7 @@
#include
#include
#include
+#include
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
@@ -115,6 +119,9 @@
#include
#endif
+#define RUAPU_IMPLEMENTATION
+#include "ruapu.h"
+
// topology info
static int g_cpucount;
static int g_physical_cpucount;
@@ -125,9 +132,6 @@ static ncnn::CpuSet g_cpu_affinity_mask_big;
// isa info
#if defined _WIN32
-#if __arm__
-static int g_cpu_support_arm_neon;
-static int g_cpu_support_arm_vfpv4;
#if __aarch64__
static int g_cpu_support_arm_asimdhp;
static int g_cpu_support_arm_cpuid;
@@ -140,10 +144,11 @@ static int g_cpu_support_arm_sve2;
static int g_cpu_support_arm_svebf16;
static int g_cpu_support_arm_svei8mm;
static int g_cpu_support_arm_svef32mm;
-#else // __aarch64__
+#elif __arm__
static int g_cpu_support_arm_edsp;
-#endif // __aarch64__
-#endif // __arm__
+static int g_cpu_support_arm_neon;
+static int g_cpu_support_arm_vfpv4;
+#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
@@ -183,171 +188,59 @@ static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
-#if defined _WIN32
-static int g_sigill_caught = 0;
-static jmp_buf g_jmpbuf;
-
-static LONG CALLBACK catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)
+static bool is_being_debugged()
{
- if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION)
- {
- g_sigill_caught = 1;
- longjmp(g_jmpbuf, -1);
- }
+#if defined _WIN32
+ return IsDebuggerPresent();
+#elif defined __ANDROID__ || defined __linux__
+ // https://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb
+ int status_fd = open("/proc/self/status", O_RDONLY);
+ if (status_fd == -1)
+ return false;
- return EXCEPTION_CONTINUE_SEARCH;
-}
+ char buf[4096];
+ ssize_t num_read = read(status_fd, buf, sizeof(buf) - 1);
+ close(status_fd);
-static int detectisa(const void* some_inst)
-{
- g_sigill_caught = 0;
+ if (num_read <= 0)
+ return false;
- PVOID eh = AddVectoredExceptionHandler(1, catch_sigill);
+ buf[num_read] = '\0';
+ const char tracerPidString[] = "TracerPid:";
+ const char* tracer_pid_ptr = strstr(buf, tracerPidString);
+ if (!tracer_pid_ptr)
+ return false;
- if (setjmp(g_jmpbuf) == 0)
+ for (const char* ch = tracer_pid_ptr + sizeof(tracerPidString) - 1; ch <= buf + num_read; ++ch)
{
- ((void (*)())some_inst)();
- }
-
- RemoveVectoredExceptionHandler(eh);
-
- return g_sigill_caught ? 0 : 1;
-}
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned char name[] = {__VA_ARGS__, 0xc3};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned char name[] = {__VA_ARGS__, 0xc3};
-#endif
-#elif __aarch64__
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
-#endif
-#elif __arm__
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
-#endif
-#endif
-
-#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
-static int g_sigill_caught = 0;
-static sigjmp_buf g_jmpbuf;
-
-static void catch_sigill(int /*signo*/, siginfo_t* /*si*/, void* /*data*/)
-{
- g_sigill_caught = 1;
- siglongjmp(g_jmpbuf, -1);
-}
-
-static int detectisa(void (*some_inst)())
-{
- g_sigill_caught = 0;
-
- struct sigaction sa;
- struct sigaction old_sa;
- memset(&sa, 0, sizeof(sa));
- sa.sa_sigaction = catch_sigill;
- sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
- sigaction(SIGILL, &sa, &old_sa);
+ if (isspace(*ch))
+ continue;
- if (sigsetjmp(g_jmpbuf, 1) == 0)
- {
- some_inst();
+ return isdigit(*ch) != 0 && *ch != '0';
}
- sigaction(SIGILL, &old_sa, NULL);
+ return false;
+#elif defined __APPLE__
+ // https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x
+ struct kinfo_proc info;
+ info.kp_proc.p_flag = 0;
- return g_sigill_caught ? 0 : 1;
-}
+ int mib[4];
+ mib[0] = CTL_KERN;
+ mib[1] = KERN_PROC;
+ mib[2] = KERN_PROC_PID;
+ mib[3] = getpid();
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".byte " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#elif __aarch64__
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".word " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#elif __arm__
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".word " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#endif
-
-#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
-
-#if defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
-DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
-DEFINE_INSTCODE(some_sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
-DEFINE_INSTCODE(some_sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
-DEFINE_INSTCODE(some_ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
-DEFINE_INSTCODE(some_sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
-DEFINE_INSTCODE(some_sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
-DEFINE_INSTCODE(some_sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
-DEFINE_INSTCODE(some_xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd %xmm0,%xmm0,%xmm0,%xmm0
-DEFINE_INSTCODE(some_avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
-DEFINE_INSTCODE(some_fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
-DEFINE_INSTCODE(some_avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
-DEFINE_INSTCODE(some_avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avxvnni, 0x62, 0xf2, 0x7d, 0x28, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avxifma, 0x62, 0xf2, 0xfd, 0x28, 0xb4, 0xc0) // vpmadd52luq %ymm0,%ymm0,%ymm0
-
-#elif __aarch64__
-DEFINE_INSTCODE(some_neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
-DEFINE_INSTCODE(some_vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
-DEFINE_INSTCODE(some_cpuid, 0xd5380000) // mrs x0,midr_el1
-DEFINE_INSTCODE(some_asimdhp, 0x0e401400) // fadd v0.4h,v0.4h,v0.4h
-DEFINE_INSTCODE(some_asimddp, 0x4e809400) // sdot v0.4h,v0.16b,v0.16b
-DEFINE_INSTCODE(some_asimdfhm, 0x4e20ec00) // fmlal v0.4s,v0.4h,v0.4h
-DEFINE_INSTCODE(some_bf16, 0x6e40ec00) // bfmmla v0.4h,v0.8h,v0.8h
-DEFINE_INSTCODE(some_i8mm, 0x4e80a400) // smmla v0.4h,v0.16b,v0.16b
-DEFINE_INSTCODE(some_sve, 0x65608000) // fmad z0.h,p0/m,z0.h,z0.h
-DEFINE_INSTCODE(some_sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
-DEFINE_INSTCODE(some_svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
-DEFINE_INSTCODE(some_svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
-DEFINE_INSTCODE(some_svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
-
-#elif __arm__
-DEFINE_INSTCODE(some_edsp, 0x0000fb20) // smlad r0,r0,r0,r0
-DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0
-DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0
+ size_t size = sizeof(info);
+ sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, NULL, 0);
+ return ((info.kp_proc.p_flag & P_TRACED) != 0);
+#else
+ // unknown platform :(
+ fprintf(stderr, "unknown platform!\n");
+ return false;
#endif
-#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+}
#if defined __ANDROID__ || defined __linux__
@@ -694,7 +587,7 @@ static int get_cpu_support_x86_avx2()
static int get_cpu_support_x86_avx_vnni()
{
#if __APPLE__
- return detectisa(some_avxvnni);
+ return ruapu_supports("avxvnni");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -720,7 +613,7 @@ static int get_cpu_support_x86_avx_vnni()
static int get_cpu_support_x86_avx512()
{
#if __APPLE__
- return detectisa(some_avx512f) && detectisa(some_avx512bw) && detectisa(some_avx512cd) && detectisa(some_avx512dq) && detectisa(some_avx512vl);
+ return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -750,7 +643,7 @@ static int get_cpu_support_x86_avx512()
static int get_cpu_support_x86_avx512_vnni()
{
#if __APPLE__
- return detectisa(some_avx512vnni);
+ return ruapu_supports("avx512vnni");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -780,7 +673,7 @@ static int get_cpu_support_x86_avx512_vnni()
static int get_cpu_support_x86_avx512_bf16()
{
#if __APPLE__
- return detectisa(some_avx512bf16);
+ return ruapu_supports("avx512bf16");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -806,7 +699,7 @@ static int get_cpu_support_x86_avx512_bf16()
static int get_cpu_support_x86_avx512_fp16()
{
#if __APPLE__
- return detectisa(some_avx512fp16);
+ return ruapu_supports("avx512fp16");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -1964,26 +1857,31 @@ static void initialize_global_cpu_info()
g_powersave = 0;
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
+#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__
+ if (!is_being_debugged())
+ {
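+        // ruapu probes isa support by executing candidate instructions and trapping
+        // illegal-instruction signals, which an attached debugger would intercept,
+        // so skip probing when being debugged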
+ ruapu_init();
+ }
+#endif
+
#if defined _WIN32
-#if __arm__
- g_cpu_support_arm_neon = detectisa(some_neon);
- g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#if __aarch64__
- g_cpu_support_arm_cpuid = detectisa(some_cpuid);
- g_cpu_support_arm_asimdhp = detectisa(some_asimdhp);
- g_cpu_support_arm_asimddp = detectisa(some_asimddp);
- g_cpu_support_arm_asimdfhm = detectisa(some_asimdfhm);
- g_cpu_support_arm_bf16 = detectisa(some_bf16);
- g_cpu_support_arm_i8mm = detectisa(some_i8mm);
- g_cpu_support_arm_sve = detectisa(some_sve);
- g_cpu_support_arm_sve2 = detectisa(some_sve2);
- g_cpu_support_arm_svebf16 = detectisa(some_svebf16);
- g_cpu_support_arm_svei8mm = detectisa(some_svei8mm);
- g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
-#else // __aarch64__
- g_cpu_support_arm_edsp = detectisa(some_edsp);
-#endif // __aarch64__
-#endif // __arm__
+ g_cpu_support_arm_cpuid = ruapu_supports("cpuid");
+ g_cpu_support_arm_asimdhp = ruapu_supports("asimdhp") || IsProcessorFeaturePresent(43); // dp implies hp
+ g_cpu_support_arm_asimddp = ruapu_supports("asimddp") || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ g_cpu_support_arm_asimdfhm = ruapu_supports("asimdfhm");
+ g_cpu_support_arm_bf16 = ruapu_supports("bf16");
+ g_cpu_support_arm_i8mm = ruapu_supports("i8mm");
+ g_cpu_support_arm_sve = ruapu_supports("sve");
+ g_cpu_support_arm_sve2 = ruapu_supports("sve2");
+ g_cpu_support_arm_svebf16 = ruapu_supports("svebf16");
+ g_cpu_support_arm_svei8mm = ruapu_supports("svei8mm");
+ g_cpu_support_arm_svef32mm = ruapu_supports("svef32mm");
+#elif __arm__
+ g_cpu_support_arm_edsp = ruapu_supports("edsp");
+ g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
+ g_cpu_support_arm_vfpv4 = ruapu_supports("vfpv4");
+#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
g_hwcaps = get_elf_hwcap(AT_HWCAP);
g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
@@ -2196,21 +2094,15 @@ int cpu_support_arm_edsp()
int cpu_support_arm_neon()
{
try_initialize_global_cpu_info();
-#if __arm__
+#if __aarch64__
+ return 1;
+#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_neon;
#elif defined __ANDROID__ || defined __linux__
-#if __aarch64__
- return g_hwcaps & HWCAP_ASIMD;
-#else
return g_hwcaps & HWCAP_NEON;
-#endif
#elif __APPLE__
-#if __aarch64__
- return g_hw_cputype == CPU_TYPE_ARM64;
-#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
-#endif
#else
return 0;
#endif
@@ -2222,22 +2114,15 @@ int cpu_support_arm_neon()
int cpu_support_arm_vfpv4()
{
try_initialize_global_cpu_info();
-#if __arm__
+#if __aarch64__
+ return 1;
+#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_vfpv4;
#elif defined __ANDROID__ || defined __linux__
-#if __aarch64__
- // neon always enable fma and fp16
- return g_hwcaps & HWCAP_ASIMD;
-#else
return g_hwcaps & HWCAP_VFPv4;
-#endif
#elif __APPLE__
-#if __aarch64__
- return g_hw_cputype == CPU_TYPE_ARM64;
-#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
-#endif
#else
return 0;
#endif
@@ -2952,7 +2837,7 @@ int get_omp_thread_num()
int get_kmp_blocktime()
{
-#if defined(_OPENMP) && __clang__
+#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
return kmp_get_blocktime();
#else
return 0;
@@ -2961,7 +2846,7 @@ int get_kmp_blocktime()
void set_kmp_blocktime(int time_ms)
{
-#if defined(_OPENMP) && __clang__
+#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
kmp_set_blocktime(time_ms);
#else
(void)time_ms;
diff --git a/src/gpu.cpp b/src/gpu.cpp
index 224dcddc235..da26f72f53d 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -26,7 +26,7 @@
#include "glslang/glslang/Public/ShaderLang.h"
#endif
-#include "vulkan_activation.comp.hex.h"
+#include "layer/vulkan/shader/vulkan_activation.comp.hex.h"
#include "command.h"
#include "layer.h"
@@ -51,6 +51,7 @@ class __ncnn_vulkan_instance_holder
{
instance = 0;
created = 0;
+ glslang_initialized = false;
#if NCNN_VULKAN_LOADER
libvulkan = 0;
@@ -76,6 +77,7 @@ class __ncnn_vulkan_instance_holder
VkInstance instance;
int created;
+ bool glslang_initialized;
#if ENABLE_VALIDATION_LAYER
VkDebugUtilsMessengerEXT callback;
@@ -321,9 +323,11 @@ class GpuInfoPrivate
// fp16 and int8 feature
bool support_fp16_packed;
bool support_fp16_storage;
+ bool support_fp16_uniform;
bool support_fp16_arithmetic;
bool support_int8_packed;
bool support_int8_storage;
+ bool support_int8_uniform;
bool support_int8_arithmetic;
// ycbcr conversion feature
@@ -331,6 +335,7 @@ class GpuInfoPrivate
// cooperative matrix
bool support_cooperative_matrix;
+ bool support_cooperative_matrix_8_8_16;
bool support_cooperative_matrix_16_8_8;
bool support_cooperative_matrix_16_8_16;
bool support_cooperative_matrix_16_16_16;
@@ -605,6 +610,11 @@ bool GpuInfo::support_fp16_storage() const
return d->support_fp16_storage;
}
+bool GpuInfo::support_fp16_uniform() const
+{
+ return d->support_fp16_uniform;
+}
+
bool GpuInfo::support_fp16_arithmetic() const
{
return d->support_fp16_arithmetic;
@@ -620,6 +630,11 @@ bool GpuInfo::support_int8_storage() const
return d->support_int8_storage;
}
+bool GpuInfo::support_int8_uniform() const
+{
+ return d->support_int8_uniform;
+}
+
bool GpuInfo::support_int8_arithmetic() const
{
return d->support_int8_arithmetic;
@@ -635,6 +650,11 @@ bool GpuInfo::support_cooperative_matrix() const
return d->support_cooperative_matrix;
}
+bool GpuInfo::support_cooperative_matrix_8_8_16() const
+{
+ return d->support_cooperative_matrix_8_8_16;
+}
+
bool GpuInfo::support_cooperative_matrix_16_8_8() const
{
return d->support_cooperative_matrix_16_8_8;
@@ -1772,12 +1792,15 @@ int create_gpu_instance(const char* driver_path)
// check features
gpu_info.support_fp16_packed = true;
gpu_info.support_fp16_storage = false;
+ gpu_info.support_fp16_uniform = false;
gpu_info.support_fp16_arithmetic = false;
gpu_info.support_int8_packed = true;
gpu_info.support_int8_storage = false;
+ gpu_info.support_int8_uniform = false;
gpu_info.support_int8_arithmetic = false;
gpu_info.support_ycbcr_conversion = false;
gpu_info.support_cooperative_matrix = false;
+ gpu_info.support_cooperative_matrix_8_8_16 = false;
gpu_info.support_cooperative_matrix_16_8_8 = false;
gpu_info.support_cooperative_matrix_16_8_16 = false;
gpu_info.support_cooperative_matrix_16_16_16 = false;
@@ -1852,30 +1875,18 @@ int create_gpu_instance(const char* driver_path)
if (gpu_info.support_VK_KHR_8bit_storage)
{
gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
+ gpu_info.support_int8_uniform = query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
}
if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
{
// shaderStorageImageExtendedFormats enables r16f format in storage image
gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
+ gpu_info.support_fp16_uniform = query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
}
if (gpu_info.support_VK_KHR_shader_float16_int8)
{
- if (gpu_info.support_fp16_storage)
- {
- gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16 && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
- }
- else
- {
- gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
- }
- if (gpu_info.support_int8_storage)
- {
- gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8 && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
- }
- else
- {
- gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
- }
+ gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
+ gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
}
if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
{
@@ -1945,6 +1956,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);
+ if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+ && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+ && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+ && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+ {
+ gpu_info.support_cooperative_matrix_8_8_16 = true;
+ }
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
@@ -1994,6 +2012,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);
+ if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+ && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+ && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+ && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+ {
+ gpu_info.support_cooperative_matrix_8_8_16 = true;
+ }
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
@@ -2027,17 +2052,17 @@ int create_gpu_instance(const char* driver_path)
NCNN_LOGE("[%u %s] bugsbn1=%d bugbilz=%d bugcopc=%d bugihfa=%d", i, physicalDeviceProperties.deviceName,
gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);
- NCNN_LOGE("[%u %s] fp16-p/s/a=%d/%d/%d int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
- gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
- gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
+ NCNN_LOGE("[%u %s] fp16-p/s/u/a=%d/%d/%d/%d int8-p/s/u/a=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
+ gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_uniform, gpu_info.support_fp16_arithmetic,
+ gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_uniform, gpu_info.support_int8_arithmetic);
NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);
- NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName,
- gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16,
- gpu_info.support_cooperative_matrix_16_16_16);
+ NCNN_LOGE("[%u %s] fp16-8x8x16/16x8x8/16x8x16/16x16x16=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
+ gpu_info.support_cooperative_matrix_8_8_16, gpu_info.support_cooperative_matrix_16_8_8,
+ gpu_info.support_cooperative_matrix_16_8_16, gpu_info.support_cooperative_matrix_16_16_16);
gpu_info_index++;
}
@@ -2047,7 +2072,7 @@ int create_gpu_instance(const char* driver_path)
// the default gpu device
g_default_gpu_index = find_default_vulkan_device_index();
- glslang::InitializeProcess();
+ g_instance.glslang_initialized = glslang::InitializeProcess();
// the global __ncnn_vulkan_instance_holder destructor will call destroy_gpu_instance() on exit
// but it seems to be too late for nvidia driver :(
@@ -2077,7 +2102,11 @@ void destroy_gpu_instance()
// NCNN_LOGE("destroy_gpu_instance");
- glslang::FinalizeProcess();
+ if (g_instance.glslang_initialized)
+ {
+ glslang::FinalizeProcess();
+ g_instance.glslang_initialized = false;
+ }
for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
{
@@ -2089,14 +2118,18 @@ void destroy_gpu_instance()
}
#if ENABLE_VALIDATION_LAYER
- if (support_VK_EXT_debug_utils)
+ if (support_VK_EXT_debug_utils && g_instance.callback)
{
DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
g_instance.callback = 0;
}
#endif // ENABLE_VALIDATION_LAYER
- vkDestroyInstance(g_instance, 0);
+ if (vkDestroyInstance)
+ {
+ vkDestroyInstance(g_instance, 0);
+ vkDestroyInstance = 0;
+ }
g_instance.instance = 0;
@@ -2481,7 +2514,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
enabled8BitStorageFeatures.pNext = 0;
enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
- enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage() && info.support_int8_arithmetic();
+ enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_uniform();
enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
{
@@ -2494,7 +2527,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
enabled16BitStorageFeatures.pNext = 0;
enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
- enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage() && info.support_fp16_arithmetic();
+ enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_uniform();
enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
@@ -3868,11 +3901,16 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
}
- if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("lfp", "float16_t"));
custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4"));
}
+ else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
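+        // fp16 storage without uniform fp16 access: keep scalar lfp as float and
+        // pack four halfs into one uint64_t for shared memory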
+ custom_defines.push_back(std::make_pair("lfp", "float"));
+ custom_defines.push_back(std::make_pair("lfpvec4", "uint64_t"));
+ }
else if (opt.use_fp16_storage || opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("lfp", "float"));
@@ -3884,7 +3922,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("lfpvec4", "vec4"));
}
- if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));
@@ -3892,6 +3930,14 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v"));
}
+ else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
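+        // round-trip through uint64_t: reinterpret f16vec4 bits as 16-bit ints and
+        // pack on store, then unpack and reinterpret back on load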
+ custom_defines.push_back(std::make_pair("sfp2lfp(v)", "float(v)"));
+ custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "pack64(halfBitsToUInt16(v))"));
+
+ custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)"));
+ custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "int16BitsToHalf(unpack16(v))"));
+ }
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
@@ -4219,6 +4265,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
}
+ if (opt.use_fp16_uniform)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_fp16_uniform", "1"));
+ }
+
if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
@@ -4233,6 +4284,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
}
+ if (opt.use_int8_uniform)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_int8_uniform", "1"));
+ }
+
if (opt.use_int8_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
diff --git a/src/gpu.h b/src/gpu.h
index c3e5e8daeac..696f651ed2b 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -260,9 +260,11 @@ class NCNN_EXPORT GpuInfo
// fp16 and int8 feature
bool support_fp16_packed() const;
bool support_fp16_storage() const;
+ bool support_fp16_uniform() const;
bool support_fp16_arithmetic() const;
bool support_int8_packed() const;
bool support_int8_storage() const;
+ bool support_int8_uniform() const;
bool support_int8_arithmetic() const;
// ycbcr conversion feature
@@ -270,6 +272,7 @@ class NCNN_EXPORT GpuInfo
// cooperative matrix feature
bool support_cooperative_matrix() const;
+ bool support_cooperative_matrix_8_8_16() const;
bool support_cooperative_matrix_16_8_8() const;
bool support_cooperative_matrix_16_8_16() const;
bool support_cooperative_matrix_16_16_16() const;
diff --git a/src/layer.cpp b/src/layer.cpp
index 562576a5493..cca3e77bf1f 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -18,21 +18,7 @@
#include
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4250)
-#endif
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Woverloaded-virtual"
-#endif
#include "layer_declaration.h"
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
namespace ncnn {
@@ -221,9 +207,289 @@ Layer* create_layer(const char* type)
return create_layer(index);
}
+
+Layer* create_layer_naive(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_naive(index);
+}
+
+Layer* create_layer_cpu(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_cpu(index);
+}
+
+#if NCNN_VULKAN
+Layer* create_layer_vulkan(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_vulkan(index);
+}
+#endif // NCNN_VULKAN
#endif // NCNN_STRING
+// internal wrapper
+class Layer_final : public Layer
+{
+public:
+ Layer* layer_cpu;
+#if NCNN_VULKAN
+ Layer* layer_vulkan;
+#endif
+
+ // utility functions for transfer layer properties
+ void set_layer_properties()
+ {
+ layer_cpu->userdata = userdata;
+
+ layer_cpu->bottoms = bottoms;
+ layer_cpu->tops = tops;
+ layer_cpu->bottom_shapes = bottom_shapes;
+ layer_cpu->top_shapes = top_shapes;
+ layer_cpu->featmask = featmask;
+
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ layer_vulkan->vkdev = vkdev;
+
+ layer_vulkan->userdata = userdata;
+
+ layer_vulkan->bottoms = bottoms;
+ layer_vulkan->tops = tops;
+ layer_vulkan->bottom_shapes = bottom_shapes;
+ layer_vulkan->top_shapes = top_shapes;
+ layer_vulkan->featmask = featmask;
+ }
+#endif
+ }
+
+ void get_layer_properties()
+ {
+ one_blob_only = layer_cpu->one_blob_only;
+ support_inplace = layer_cpu->support_inplace;
+ support_packing = layer_cpu->support_packing;
+ support_bf16_storage = layer_cpu->support_bf16_storage;
+ support_fp16_storage = layer_cpu->support_fp16_storage;
+ support_int8_storage = layer_cpu->support_int8_storage;
+
+ support_vulkan = 0;
+ support_image_storage = 0;
+ support_tensor_storage = 0;
+
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ support_vulkan = layer_vulkan->support_vulkan;
+ support_image_storage = layer_vulkan->support_image_storage;
+ support_tensor_storage = layer_vulkan->support_tensor_storage;
+ }
+#endif
+ }
+
+public:
+ Layer_final()
+ {
+ layer_cpu = 0;
+#if NCNN_VULKAN
+ layer_vulkan = 0;
+#endif
+ }
+
+ ~Layer_final()
+ {
+ delete layer_cpu;
+#if NCNN_VULKAN
+ delete layer_vulkan;
+#endif
+ }
+
+ virtual int load_param(const ParamDict& pd)
+ {
+ set_layer_properties();
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ if (vkdev)
+ {
+ int ret = layer_vulkan->load_param(pd);
+ get_layer_properties();
+
+ if (layer_vulkan->support_vulkan)
+ return ret;
+ }
+
+ // fallback to cpu layer
+ delete layer_vulkan;
+ layer_vulkan = 0;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->load_param(pd);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int load_model(const ModelBin& mb)
+ {
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ int ret = layer_vulkan->load_model(mb);
+ get_layer_properties();
+ return ret;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->load_model(mb);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int create_pipeline(const Option& opt)
+ {
+ set_layer_properties();
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ int ret = layer_vulkan->create_pipeline(opt);
+ get_layer_properties();
+ return ret;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->create_pipeline(opt);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int destroy_pipeline(const Option& opt)
+ {
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ return layer_vulkan->destroy_pipeline(opt);
+ }
+#endif // NCNN_VULKAN
+
+ return layer_cpu->destroy_pipeline(opt);
+ }
+
+public:
+ virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+ {
+ return layer_cpu->forward(bottom_blobs, top_blobs, opt);
+ }
+
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+ {
+ return layer_cpu->forward(bottom_blob, top_blob, opt);
+ }
+
+ virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
+ {
+ return layer_cpu->forward_inplace(bottom_top_blobs, opt);
+ }
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+ {
+ return layer_cpu->forward_inplace(bottom_top_blob, opt);
+ }
+
+#if NCNN_VULKAN
+public:
+ virtual int upload_model(VkTransfer& cmd, const Option& opt)
+ {
+ return layer_vulkan ? layer_vulkan->upload_model(cmd, opt) : -1;
+ }
+
+ virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt) : -1;
+ }
+#endif // NCNN_VULKAN
+};
+
Layer* create_layer(int index)
+{
+ Layer* layer_cpu = create_layer_cpu(index);
+ if (!layer_cpu)
+ return 0;
+
+ Layer_final* layer_final = new Layer_final;
+ layer_final->layer_cpu = layer_cpu;
+
+#if NCNN_VULKAN
+ layer_final->layer_vulkan = create_layer_vulkan(index);
+#endif
+
+ layer_final->typeindex = index;
+ layer_final->set_layer_properties();
+ layer_final->get_layer_properties();
+
+ return layer_final;
+}
+
+Layer* create_layer_naive(int index)
+{
+ if (index < 0 || index >= layer_registry_entry_count)
+ return 0;
+
+ layer_creator_func layer_creator = layer_registry[index].creator;
+ if (!layer_creator)
+ return 0;
+
+ Layer* layer = layer_creator(0);
+ layer->typeindex = index;
+ return layer;
+}
+
+Layer* create_layer_cpu(int index)
{
if (index < 0 || index >= layer_registry_entry_count)
return 0;
@@ -280,6 +546,11 @@ Layer* create_layer(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+ {
+ layer_creator = layer_registry_arch[index].creator;
+ }
+
+ if (!layer_creator)
{
layer_creator = layer_registry[index].creator;
}
@@ -293,4 +564,20 @@ Layer* create_layer(int index)
return layer;
}
+#if NCNN_VULKAN
+Layer* create_layer_vulkan(int index)
+{
+ if (index < 0 || index >= layer_registry_entry_count)
+ return 0;
+
+ layer_creator_func layer_creator = layer_registry_vulkan[index].creator;
+ if (!layer_creator)
+ return 0;
+
+ Layer* layer = layer_creator(0);
+ layer->typeindex = index;
+ return layer;
+}
+#endif // NCNN_VULKAN
+
} // namespace ncnn
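
The new entry points split what used to be one factory: create_layer_naive returns the reference implementation from layer_registry, create_layer_cpu picks the arch-optimized creator (layer_registry_arch, falling back to the generic registry), create_layer_vulkan returns the GPU implementation or null, and create_layer itself now returns the Layer_final wrapper that owns one of each. A minimal caller-side sketch (illustrative helper, not part of the patch; assumes an ncnn build with NCNN_STRING enabled):

// Illustrative sketch: picking a backend with the split factories.
#include "layer.h"

static ncnn::Layer* make_relu(bool prefer_vulkan)
{
#if NCNN_VULKAN
    if (prefer_vulkan)
    {
        // vulkan-only object; returns null when the layer has no vulkan variant
        if (ncnn::Layer* l = ncnn::create_layer_vulkan("ReLU"))
            return l;
    }
#endif
    // arch-optimized cpu object, chosen from layer_registry_arch at runtime
    return ncnn::create_layer_cpu("ReLU");
}

Plain create_layer("ReLU") keeps working and now hands back the Layer_final wrapper above, which owns one cpu object plus an optional vulkan object and forwards each virtual call to whichever backend survived load_param.
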
diff --git a/src/layer.h b/src/layer.h
index 573f58cf94a..d44713de451 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -199,9 +199,19 @@ struct overwrite_builtin_layer_registry_entry
NCNN_EXPORT int layer_to_index(const char* type);
// create layer from type name
NCNN_EXPORT Layer* create_layer(const char* type);
+NCNN_EXPORT Layer* create_layer_naive(const char* type);
+NCNN_EXPORT Layer* create_layer_cpu(const char* type);
+#if NCNN_VULKAN
+NCNN_EXPORT Layer* create_layer_vulkan(const char* type);
+#endif // NCNN_VULKAN
#endif // NCNN_STRING
// create layer from layer type
NCNN_EXPORT Layer* create_layer(int index);
+NCNN_EXPORT Layer* create_layer_naive(int index);
+NCNN_EXPORT Layer* create_layer_cpu(int index);
+#if NCNN_VULKAN
+NCNN_EXPORT Layer* create_layer_vulkan(int index);
+#endif // NCNN_VULKAN
#define DEFINE_LAYER_CREATOR(name) \
::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h
index 7e6f150e7f1..c1cea1dfb9e 100644
--- a/src/layer/arm/absval_arm.h
+++ b/src/layer/arm/absval_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_arm : virtual public AbsVal
+class AbsVal_arm : public AbsVal
{
public:
AbsVal_arm();
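
The `virtual public` → `public` change here repeats across every backend header in this patch (arm here, loongarch and others further down). Previously the generated *_final classes in layer_declaration.h inherited from the base layer and its backend specializations at once, which required virtual bases to keep a single shared Layer subobject — and the dominance/overloaded-virtual warning suppressions deleted at the top of layer.cpp. With Layer_final owning its backends by pointer, each backend is an ordinary single-inheritance chain. A standalone sketch of the two shapes (illustrative names, not ncnn code):

#include <cstdio>

struct Layer
{
    int featmask = 0;
    virtual ~Layer() {}
};

// before: the generated final class inherited several backends at once,
// so every backend derived virtually to share one Layer subobject:
//   struct AbsVal : virtual Layer {};
//   struct AbsVal_arm : virtual public AbsVal {};
//   struct AbsVal_final : AbsVal, AbsVal_arm {}; // diamond
// after: one plain chain per backend; the wrapper owns it by pointer
struct AbsVal : Layer {};
struct AbsVal_arm : AbsVal {};

struct Final_sketch : Layer
{
    Layer* layer_cpu = new AbsVal_arm; // composition, no diamond
    ~Final_sketch() { delete layer_cpu; }
};

int main()
{
    Final_sketch f;
    std::printf("featmask = %d\n", f.layer_cpu->featmask);
    return 0;
}
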
diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h
index 9be82439cb4..1393bb30e12 100644
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_arm : virtual public BatchNorm
+class BatchNorm_arm : public BatchNorm
{
public:
BatchNorm_arm();
diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h
index a3b61cd300d..5f08facf17e 100644
--- a/src/layer/arm/bias_arm.h
+++ b/src/layer/arm/bias_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_arm : virtual public Bias
+class Bias_arm : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/binaryop_arm.h b/src/layer/arm/binaryop_arm.h
index 6bb950495ce..1337065eb40 100644
--- a/src/layer/arm/binaryop_arm.h
+++ b/src/layer/arm/binaryop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_arm : virtual public BinaryOp
+class BinaryOp_arm : public BinaryOp
{
public:
BinaryOp_arm();
diff --git a/src/layer/arm/cast_arm.h b/src/layer/arm/cast_arm.h
index 190090a859a..fc32c70d3dd 100644
--- a/src/layer/arm/cast_arm.h
+++ b/src/layer/arm/cast_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_arm : virtual public Cast
+class Cast_arm : public Cast
{
public:
Cast_arm();
diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h
index 5ae5513145d..ab196687154 100644
--- a/src/layer/arm/cast_bf16.h
+++ b/src/layer/arm/cast_bf16.h
@@ -88,7 +88,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const
"vshrn.u32 d1, q1, #16 \n"
"vshrn.u32 d2, q2, #16 \n"
"vshrn.u32 d3, q3, #16 \n"
- "vst1.u16 {d0-d3}, [%1 :128]! \n"
+ "vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
@@ -231,7 +231,7 @@ static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
#else // __aarch64__
asm volatile(
"pld [%0, #256] \n"
- "vld1.u16 {d4-d7}, [%0 :128]! \n"
+ "vld1.u16 {d4-d7}, [%0]! \n"
"vshll.u16 q0, d4, #16 \n"
"vshll.u16 q1, d5, #16 \n"
"vshll.u16 q2, d6, #16 \n"
diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h
index b27a6ebd34f..7d5866d0a19 100644
--- a/src/layer/arm/cast_fp16.h
+++ b/src/layer/arm/cast_fp16.h
@@ -62,13 +62,13 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
- "pld [%0, #512] \n"
- "vldm %0!, {d0-d7} \n"
- "vcvt.f16.f32 d0, q0 \n"
- "vcvt.f16.f32 d1, q1 \n"
- "vcvt.f16.f32 d2, q2 \n"
- "vcvt.f16.f32 d3, q3 \n"
- "vst1.u16 {d0-d3}, [%1 :128]! \n"
+ "pld [%0, #512] \n"
+ "vldm %0!, {d0-d7} \n"
+ "vcvt.f16.f32 d0, q0 \n"
+ "vcvt.f16.f32 d1, q1 \n"
+ "vcvt.f16.f32 d2, q2 \n"
+ "vcvt.f16.f32 d3, q3 \n"
+ "vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
@@ -220,13 +220,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
- "pld [%0, #256] \n"
- "vld1.u16 {d4-d7}, [%0 :128]! \n"
- "vcvt.f32.f16 q0, d4 \n"
- "vcvt.f32.f16 q1, d5 \n"
- "vcvt.f32.f16 q2, d6 \n"
- "vcvt.f32.f16 q3, d7 \n"
- "vstm %1!, {d0-d7} \n"
+ "pld [%0, #256] \n"
+ "vld1.u16 {d4-d7}, [%0]! \n"
+ "vcvt.f32.f16 q0, d4 \n"
+ "vcvt.f32.f16 q1, d5 \n"
+ "vcvt.f32.f16 q2, d6 \n"
+ "vcvt.f32.f16 q3, d7 \n"
+ "vstm %1!, {d0-d7} \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h
index 8af695172e1..ef281d249e7 100644
--- a/src/layer/arm/clip_arm.h
+++ b/src/layer/arm/clip_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_arm : virtual public Clip
+class Clip_arm : public Clip
{
public:
Clip_arm();
diff --git a/src/layer/arm/concat_arm.h b/src/layer/arm/concat_arm.h
index c09dfa27568..9491a280110 100644
--- a/src/layer/arm/concat_arm.h
+++ b/src/layer/arm/concat_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_arm : virtual public Concat
+class Concat_arm : public Concat
{
public:
Concat_arm();
diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp
index 48368fb9cc6..26389279b18 100644
--- a/src/layer/arm/convolution1d_arm.cpp
+++ b/src/layer/arm/convolution1d_arm.cpp
@@ -68,6 +68,8 @@ int Convolution1D_arm::create_pipeline(const Option& opt)
convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
@@ -196,7 +198,7 @@ int Convolution1D_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -237,6 +239,8 @@ int Convolution1D_arm::create_pipeline_bf16s(const Option& /*opt*/)
convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
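
These `weight_data.release()` insertions, and the many hunks below that delete the `if (opt.lightmode)` guard around an existing release, implement one policy: once create_pipeline has produced its transformed copy (weight_data_tm, the fp16/bf16 repacks, or a sub-layer's own pipeline), the raw fp32 weights are never read again, so they are freed unconditionally rather than only in lightmode. The resulting pattern, sketched with hypothetical names:

#include "mat.h" // ncnn::Mat

// pattern sketch with hypothetical names, mirroring the patched hunks
static int create_pipeline_pattern(ncnn::Mat& weight_data, ncnn::Mat& weight_data_tm)
{
    // repack raw weights into the layout the inference kernels consume
    weight_data_tm = weight_data.clone(); // stand-in for the real transform

    // previously: if (opt.lightmode) weight_data.release();
    // now: always released -- forward() reads only weight_data_tm
    weight_data.release();
    return 0;
}
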
diff --git a/src/layer/arm/convolution1d_arm.h b/src/layer/arm/convolution1d_arm.h
index 83e0ea83809..48babb914d2 100644
--- a/src/layer/arm/convolution1d_arm.h
+++ b/src/layer/arm/convolution1d_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_arm : virtual public Convolution1D
+class Convolution1D_arm : public Convolution1D
{
public:
Convolution1D_arm();
diff --git a/src/layer/arm/convolution1d_arm_asimdhp.cpp b/src/layer/arm/convolution1d_arm_asimdhp.cpp
index bbbd5883027..2e194eabf21 100644
--- a/src/layer/arm/convolution1d_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution1d_arm_asimdhp.cpp
@@ -36,6 +36,8 @@ int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+ weight_data.release();
+
return 0;
}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index cde8c216873..f7f04619e9e 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -157,7 +157,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
{
- convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
+ convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -194,6 +194,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
convolution_dilation1->create_pipeline(opt);
+ weight_data.release();
+
return 0;
}
@@ -222,10 +224,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
else
conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -271,10 +270,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
{
convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -309,10 +305,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -807,7 +800,7 @@ int Convolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -1031,10 +1022,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1042,10 +1030,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h
index 412590f101e..8536c081320 100644
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_arm : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_arm : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_arm();
diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
index f7d2cfee84c..1d5f2782cc1 100644
--- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -76,10 +76,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -87,10 +84,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/crop_arm.h b/src/layer/arm/crop_arm.h
index e3f6d5109a3..9f2bea6e1bd 100644
--- a/src/layer/arm/crop_arm.h
+++ b/src/layer/arm/crop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_arm : virtual public Crop
+class Crop_arm : public Crop
{
public:
Crop_arm();
diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp
index ef8a56f9931..24c825ae266 100644
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -85,7 +85,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -211,10 +211,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -851,7 +848,7 @@ int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -957,10 +954,7 @@ int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h
index 3c7979687cb..b4cdcbe0ee9 100644
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_arm : virtual public Deconvolution
+class Deconvolution_arm : public Deconvolution
{
public:
Deconvolution_arm();
diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp
index c98ba40309b..b5498d815f3 100644
--- a/src/layer/arm/deconvolution_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -45,7 +45,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -154,10 +154,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp
index 478bd1740dc..4eac426d9de 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -104,10 +104,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -148,7 +145,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -193,10 +190,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -562,7 +556,7 @@ int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, st
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h
index 6eff45ede3a..a7ef393dd25 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_arm();
diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
index 09e0fca4356..5fa42d07490 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
@@ -100,7 +100,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -145,10 +145,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/dequantize_arm.h b/src/layer/arm/dequantize_arm.h
index 5bba8de7fdd..677c731db69 100644
--- a/src/layer/arm/dequantize_arm.h
+++ b/src/layer/arm/dequantize_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_arm : virtual public Dequantize
+class Dequantize_arm : public Dequantize
{
public:
Dequantize_arm();
diff --git a/src/layer/arm/dropout_arm.h b/src/layer/arm/dropout_arm.h
index 395c5a9d02c..9a970525aae 100644
--- a/src/layer/arm/dropout_arm.h
+++ b/src/layer/arm/dropout_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_arm : virtual public Dropout
+class Dropout_arm : public Dropout
{
public:
Dropout_arm();
diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h
index 5480f2293ce..6bd91f5dab5 100644
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_arm : virtual public Eltwise
+class Eltwise_arm : public Eltwise
{
public:
Eltwise_arm();
diff --git a/src/layer/arm/flatten_arm.h b/src/layer/arm/flatten_arm.h
index 92932ba7744..9bc9a0d1b99 100644
--- a/src/layer/arm/flatten_arm.h
+++ b/src/layer/arm/flatten_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_arm : virtual public Flatten
+class Flatten_arm : public Flatten
{
public:
Flatten_arm();
diff --git a/src/layer/arm/gelu_arm.h b/src/layer/arm/gelu_arm.h
index 283f063bb69..5be9fc4d6d5 100644
--- a/src/layer/arm/gelu_arm.h
+++ b/src/layer/arm/gelu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_arm : virtual public GELU
+class GELU_arm : public GELU
{
public:
GELU_arm();
diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp
index 2d4ff8734f8..3463550d3d4 100644
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -4201,10 +4201,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4244,10 +4241,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4277,10 +4271,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
@@ -4898,10 +4889,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4941,10 +4929,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4974,10 +4959,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gemm_arm.h b/src/layer/arm/gemm_arm.h
index e4e4b81f2ee..0c1eab108ba 100644
--- a/src/layer/arm/gemm_arm.h
+++ b/src/layer/arm/gemm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_arm : virtual public Gemm
+class Gemm_arm : public Gemm
{
public:
Gemm_arm();
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index ff840df3b50..cfe6ce8ce60 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2736,10 +2736,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -2779,10 +2776,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -2808,10 +2802,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gemm_arm_vfpv4.cpp b/src/layer/arm/gemm_arm_vfpv4.cpp
index 3d29af41860..5792e47e980 100644
--- a/src/layer/arm/gemm_arm_vfpv4.cpp
+++ b/src/layer/arm/gemm_arm_vfpv4.cpp
@@ -427,10 +427,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -470,10 +467,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -503,10 +497,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp
index 70df351a555..58df8275ad5 100644
--- a/src/layer/arm/gru_arm.cpp
+++ b/src/layer/arm/gru_arm.cpp
@@ -250,6 +250,10 @@ int GRU_arm::create_pipeline(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
@@ -1368,6 +1372,10 @@ int GRU_arm::create_pipeline_bf16s(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/gru_arm.h b/src/layer/arm/gru_arm.h
index e1e8fbb08fd..6eae1656b01 100644
--- a/src/layer/arm/gru_arm.h
+++ b/src/layer/arm/gru_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GRU_arm : virtual public GRU
+class GRU_arm : public GRU
{
public:
GRU_arm();
diff --git a/src/layer/arm/gru_arm_asimdhp.cpp b/src/layer/arm/gru_arm_asimdhp.cpp
index c38458176af..fcdce2d8e18 100644
--- a/src/layer/arm/gru_arm_asimdhp.cpp
+++ b/src/layer/arm/gru_arm_asimdhp.cpp
@@ -914,6 +914,10 @@ int GRU_arm::create_pipeline_fp16s(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/hardsigmoid_arm.h b/src/layer/arm/hardsigmoid_arm.h
index bfa04828ac6..13783ff1690 100644
--- a/src/layer/arm/hardsigmoid_arm.h
+++ b/src/layer/arm/hardsigmoid_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_arm : virtual public HardSigmoid
+class HardSigmoid_arm : public HardSigmoid
{
public:
HardSigmoid_arm();
diff --git a/src/layer/arm/hardswish_arm.h b/src/layer/arm/hardswish_arm.h
index 7309ba6c71f..a534ceb1677 100644
--- a/src/layer/arm/hardswish_arm.h
+++ b/src/layer/arm/hardswish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_arm : virtual public HardSwish
+class HardSwish_arm : public HardSwish
{
public:
HardSwish_arm();
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
index 98eda2d171b..0cbc78525eb 100644
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -46,7 +46,7 @@ InnerProduct_arm::InnerProduct_arm()
int InnerProduct_arm::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -122,10 +122,7 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -868,10 +865,7 @@ int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1264,10 +1258,7 @@ int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index f1eee178f9c..70a54533151 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_arm : virtual public InnerProduct
+class InnerProduct_arm : public InnerProduct
{
public:
InnerProduct_arm();
diff --git a/src/layer/arm/innerproduct_arm_vfpv4.cpp b/src/layer/arm/innerproduct_arm_vfpv4.cpp
index 435fb883e50..6a6eab84fba 100644
--- a/src/layer/arm/innerproduct_arm_vfpv4.cpp
+++ b/src/layer/arm/innerproduct_arm_vfpv4.cpp
@@ -41,10 +41,7 @@ int InnerProduct_arm::create_pipeline_fp16s(const Option& opt)
}
#endif
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/instancenorm_arm.h b/src/layer/arm/instancenorm_arm.h
index 102c49fe2b0..98dec71ac48 100644
--- a/src/layer/arm/instancenorm_arm.h
+++ b/src/layer/arm/instancenorm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InstanceNorm_arm : virtual public InstanceNorm
+class InstanceNorm_arm : public InstanceNorm
{
public:
InstanceNorm_arm();
diff --git a/src/layer/arm/interp_arm.h b/src/layer/arm/interp_arm.h
index 5ea9873ae78..6c15c73801b 100644
--- a/src/layer/arm/interp_arm.h
+++ b/src/layer/arm/interp_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_arm : virtual public Interp
+class Interp_arm : public Interp
{
public:
Interp_arm();
diff --git a/src/layer/arm/lrn_arm.h b/src/layer/arm/lrn_arm.h
index db9a04e0adb..f2c43ba08f2 100644
--- a/src/layer/arm/lrn_arm.h
+++ b/src/layer/arm/lrn_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_arm : virtual public LRN
+class LRN_arm : public LRN
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp
index 04d7277547e..b8d5afe93dc 100644
--- a/src/layer/arm/lstm_arm.cpp
+++ b/src/layer/arm/lstm_arm.cpp
@@ -124,12 +124,9 @@ int LSTM_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
@@ -931,12 +928,9 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/arm/lstm_arm.h b/src/layer/arm/lstm_arm.h
index a42dff28823..b5ee1092a52 100644
--- a/src/layer/arm/lstm_arm.h
+++ b/src/layer/arm/lstm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LSTM_arm : virtual public LSTM
+class LSTM_arm : public LSTM
{
public:
LSTM_arm();
diff --git a/src/layer/arm/lstm_arm_asimdhp.cpp b/src/layer/arm/lstm_arm_asimdhp.cpp
index 1d3fc71cdfc..593af33ccd4 100644
--- a/src/layer/arm/lstm_arm_asimdhp.cpp
+++ b/src/layer/arm/lstm_arm_asimdhp.cpp
@@ -835,12 +835,9 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/arm/matmul_arm.cpp b/src/layer/arm/matmul_arm.cpp
index 7117ce49511..363ab4490bb 100644
--- a/src/layer/arm/matmul_arm.cpp
+++ b/src/layer/arm/matmul_arm.cpp
@@ -37,7 +37,7 @@ MatMul_arm::MatMul_arm()
int MatMul_arm::create_pipeline(const Option& opt)
{
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
diff --git a/src/layer/arm/matmul_arm.h b/src/layer/arm/matmul_arm.h
index 4d4784ce50d..a4537300d5a 100644
--- a/src/layer/arm/matmul_arm.h
+++ b/src/layer/arm/matmul_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MatMul_arm : virtual public MatMul
+class MatMul_arm : public MatMul
{
public:
MatMul_arm();
diff --git a/src/layer/arm/mish_arm.h b/src/layer/arm/mish_arm.h
index 708611589f4..9f99a7a1200 100644
--- a/src/layer/arm/mish_arm.h
+++ b/src/layer/arm/mish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_arm : virtual public Mish
+class Mish_arm : public Mish
{
public:
Mish_arm();
diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp
index 15eca715699..b3f3d7aa8e7 100644
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -48,7 +48,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
opt.use_bf16_storage &= support_bf16_storage;
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, -1);
pd.set(1, 1);
@@ -61,7 +61,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
const int embed_dim_per_head = embed_dim / num_heads;
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
pd.set(1, 1.f);
@@ -84,15 +84,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- q_weight_data.release();
- q_bias_data.release();
- }
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -113,15 +110,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- k_weight_data.release();
- k_bias_data.release();
- }
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -142,15 +136,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- v_weight_data.release();
- v_bias_data.release();
- }
+ v_weight_data.release();
+ v_bias_data.release();
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 1); // transB
@@ -169,15 +160,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- out_weight_data.release();
- out_bias_data.release();
- }
+ out_weight_data.release();
+ out_bias_data.release();
}
{
- qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 0); // transB
@@ -198,7 +186,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
}
{
- qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h
index fb1010b1b01..f1b721f22ea 100644
--- a/src/layer/arm/multiheadattention_arm.h
+++ b/src/layer/arm/multiheadattention_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_arm : virtual public MultiHeadAttention
+class MultiHeadAttention_arm : public MultiHeadAttention
{
public:
MultiHeadAttention_arm();
diff --git a/src/layer/arm/packing_arm.h b/src/layer/arm/packing_arm.h
index 20cb04ac5f3..17c64854058 100644
--- a/src/layer/arm/packing_arm.h
+++ b/src/layer/arm/packing_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_arm : virtual public Packing
+class Packing_arm : public Packing
{
public:
Packing_arm();
diff --git a/src/layer/arm/padding_arm.h b/src/layer/arm/padding_arm.h
index 81156fcd831..164cfe4c33a 100644
--- a/src/layer/arm/padding_arm.h
+++ b/src/layer/arm/padding_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_arm : virtual public Padding
+class Padding_arm : public Padding
{
public:
Padding_arm();
diff --git a/src/layer/arm/pixelshuffle_arm.h b/src/layer/arm/pixelshuffle_arm.h
index c40d67ddec8..a2d714c9ebb 100644
--- a/src/layer/arm/pixelshuffle_arm.h
+++ b/src/layer/arm/pixelshuffle_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PixelShuffle_arm : virtual public PixelShuffle
+class PixelShuffle_arm : public PixelShuffle
{
public:
PixelShuffle_arm();
diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h
index 0193faa6a87..ead9270c717 100644
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_arm : virtual public Pooling
+class Pooling_arm : public Pooling
{
public:
Pooling_arm();
diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h
index e65801a3be0..9354be7440b 100644
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_arm : virtual public PReLU
+class PReLU_arm : public PReLU
{
public:
PReLU_arm();
diff --git a/src/layer/arm/quantize_arm.h b/src/layer/arm/quantize_arm.h
index 3ed271ca7fe..60a716198cb 100644
--- a/src/layer/arm/quantize_arm.h
+++ b/src/layer/arm/quantize_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_arm : virtual public Quantize
+class Quantize_arm : public Quantize
{
public:
Quantize_arm();
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
index 77bda6ac5b5..c2212513a42 100644
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_arm : virtual public ReLU
+class ReLU_arm : public ReLU
{
public:
ReLU_arm();
diff --git a/src/layer/arm/requantize_arm.h b/src/layer/arm/requantize_arm.h
index e7093a7e4c1..c6fc184a018 100644
--- a/src/layer/arm/requantize_arm.h
+++ b/src/layer/arm/requantize_arm.h
@@ -20,7 +20,7 @@
namespace ncnn {
-class Requantize_arm : virtual public Requantize
+class Requantize_arm : public Requantize
{
public:
Requantize_arm();
diff --git a/src/layer/arm/reshape_arm.h b/src/layer/arm/reshape_arm.h
index 7a2474b7cb5..85466ecfd68 100644
--- a/src/layer/arm/reshape_arm.h
+++ b/src/layer/arm/reshape_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_arm : virtual public Reshape
+class Reshape_arm : public Reshape
{
public:
Reshape_arm();
diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp
index 19f439ea2d5..15b9f0b8a0d 100644
--- a/src/layer/arm/rnn_arm.cpp
+++ b/src/layer/arm/rnn_arm.cpp
@@ -139,6 +139,10 @@ int RNN_arm::create_pipeline(const Option& opt)
bias_c_data_packed = bias_c_data;
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
@@ -732,6 +736,10 @@ int RNN_arm::create_pipeline_bf16s(const Option& opt)
cast_float32_to_bfloat16(bias_c_data, bias_c_data_packed, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/rnn_arm.h b/src/layer/arm/rnn_arm.h
index 5defad4cf08..18e75642b9e 100644
--- a/src/layer/arm/rnn_arm.h
+++ b/src/layer/arm/rnn_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class RNN_arm : virtual public RNN
+class RNN_arm : public RNN
{
public:
RNN_arm();
diff --git a/src/layer/arm/rnn_arm_asimdhp.cpp b/src/layer/arm/rnn_arm_asimdhp.cpp
index c34b3e8bb48..467dba614f8 100644
--- a/src/layer/arm/rnn_arm_asimdhp.cpp
+++ b/src/layer/arm/rnn_arm_asimdhp.cpp
@@ -517,6 +517,10 @@ int RNN_arm::create_pipeline_fp16s(const Option& opt)
cast_float32_to_float16(bias_c_data, bias_c_data_packed, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h
index c327376d17e..c540cdd62ed 100644
--- a/src/layer/arm/scale_arm.h
+++ b/src/layer/arm/scale_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_arm : virtual public Scale
+class Scale_arm : public Scale
{
public:
Scale_arm();
diff --git a/src/layer/arm/selu_arm.h b/src/layer/arm/selu_arm.h
index ad0bdf2f955..d951804db68 100644
--- a/src/layer/arm/selu_arm.h
+++ b/src/layer/arm/selu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_arm : virtual public SELU
+class SELU_arm : public SELU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/shufflechannel_arm.h b/src/layer/arm/shufflechannel_arm.h
index f7a32ac4ab7..dcdbf760bb3 100644
--- a/src/layer/arm/shufflechannel_arm.h
+++ b/src/layer/arm/shufflechannel_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_arm : virtual public ShuffleChannel
+class ShuffleChannel_arm : public ShuffleChannel
{
public:
ShuffleChannel_arm();
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
index f532a44d6f5..4c3901abbe9 100644
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_arm : virtual public Sigmoid
+class Sigmoid_arm : public Sigmoid
{
public:
Sigmoid_arm();
diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h
index 50da56743b2..c3b558b9e1d 100644
--- a/src/layer/arm/slice_arm.h
+++ b/src/layer/arm/slice_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_arm : virtual public Slice
+class Slice_arm : public Slice
{
public:
Slice_arm();
diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h
index fced6398c54..78c540845b0 100644
--- a/src/layer/arm/softmax_arm.h
+++ b/src/layer/arm/softmax_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_arm : virtual public Softmax
+class Softmax_arm : public Softmax
{
public:
Softmax_arm();
diff --git a/src/layer/arm/softmax_arm_asimdhp.cpp b/src/layer/arm/softmax_arm_asimdhp.cpp
index 844e32ce908..3ef14a34acb 100644
--- a/src/layer/arm/softmax_arm_asimdhp.cpp
+++ b/src/layer/arm/softmax_arm_asimdhp.cpp
@@ -255,7 +255,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 32;
maxptr += 4;
@@ -292,7 +292,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
vst1q_f16(ptr, _p0);
vst1q_f16(ptr + 8, _p1);
float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 16;
maxptr += 4;
@@ -743,7 +743,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 32;
sumptr += 4;
@@ -768,7 +768,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _p1 = vld1q_f16(ptr + 8);
float16x4_t _sum = vld1_f16(sumptr);
float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 16;
sumptr += 4;
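
Unlike the surrounding cleanups, this is a correctness fix: the fp16 softmax reduction collapsed its partial sums with vpmax_f16 (pairwise maximum) where the denominator needs a pairwise add, so the affected lanes accumulated only the largest contributions. The corrected horizontal step as intrinsics (a sketch; assumes ARMv8.2 fp16 arithmetic, as the _asimdhp source does):

#include <arm_neon.h>

// collapse two float16x8_t partial sums into four per-position sums,
// matching the fixed vpaddq_f16 / vpadd_f16 sequence in the patch
static inline float16x4_t hsum_step(float16x4_t sum, float16x8_t p0, float16x8_t p1)
{
    float16x8_t ss = vpaddq_f16(p0, p1);
    // was vpmax_f16: a max, not a sum -- wrong softmax denominator
    return vadd_f16(sum, vpadd_f16(vget_low_f16(ss), vget_high_f16(ss)));
}
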
diff --git a/src/layer/arm/swish_arm.h b/src/layer/arm/swish_arm.h
index ac24757c397..907d79708ab 100644
--- a/src/layer/arm/swish_arm.h
+++ b/src/layer/arm/swish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_arm : virtual public Swish
+class Swish_arm : public Swish
{
public:
Swish_arm();
diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h
index e019b32ec4f..db62f117a56 100644
--- a/src/layer/arm/tanh_arm.h
+++ b/src/layer/arm/tanh_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_arm : virtual public TanH
+class TanH_arm : public TanH
{
public:
TanH_arm();
diff --git a/src/layer/arm/unaryop_arm.h b/src/layer/arm/unaryop_arm.h
index 66994eb2103..ab4b23c05f1 100644
--- a/src/layer/arm/unaryop_arm.h
+++ b/src/layer/arm/unaryop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_arm : virtual public UnaryOp
+class UnaryOp_arm : public UnaryOp
{
public:
UnaryOp_arm();
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index 4acf91869ae..fe025456f48 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -95,17 +95,9 @@ int Convolution::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int Convolution::create_pipeline(const Option& opt)
-{
- if (dynamic_weight)
- return 0;
-
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
const int maxk = kernel_w * kernel_h;
const int num_input = weight_data_size / num_output / maxk;
@@ -114,7 +106,8 @@ int Convolution::create_pipeline(const Option& opt)
Mat weight_data_int8;
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.blob_allocator = weight_data.allocator;
opt_q.use_packing_layout = false;
quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
@@ -123,8 +116,6 @@ int Convolution::create_pipeline(const Option& opt)
weight_data = weight_data_int8.reshape(weight_data_size);
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
@@ -219,7 +210,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
if (bottom_blob.w * bottom_blob.elempack == num_input)
{
// call InnerProduct
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::InnerProduct);
// set param
ncnn::ParamDict pd;
diff --git a/src/layer/convolution.h b/src/layer/convolution.h
index 476a7aaf67b..7af0735fd30 100644
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -28,8 +28,6 @@ class Convolution : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolution1d.cpp b/src/layer/convolution1d.cpp
index 184b2bdb60d..7d6be1e111e 100644
--- a/src/layer/convolution1d.cpp
+++ b/src/layer/convolution1d.cpp
@@ -67,14 +67,6 @@ int Convolution1D::load_model(const ModelBin& mb)
return 0;
}
-int Convolution1D::create_pipeline(const Option&)
-{
- if (dynamic_weight)
- return 0;
-
- return 0;
-}
-
static int convolution1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int activation_type, const Mat& activation_params, const Option& opt)
{
const int h = bottom_blob.h;
diff --git a/src/layer/convolution1d.h b/src/layer/convolution1d.h
index e30807e5c9b..d87099e25f2 100644
--- a/src/layer/convolution1d.h
+++ b/src/layer/convolution1d.h
@@ -28,8 +28,6 @@ class Convolution1D : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index e820a192cb3..fb8e1e5c0b2 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -124,14 +124,9 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int ConvolutionDepthWise::create_pipeline(const Option& opt)
-{
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
Mat int8_weight_data(weight_data_size, (size_t)1u);
if (int8_weight_data.empty())
@@ -141,7 +136,8 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt)
for (int g = 0; g < group; g++)
{
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.blob_allocator = int8_weight_data.allocator;
opt_q.use_packing_layout = false;
@@ -153,8 +149,6 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt)
weight_data = int8_weight_data;
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h
index e893aa07fc9..8a955dbd23b 100644
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -28,8 +28,6 @@ class ConvolutionDepthWise : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolutiondepthwise1d.cpp b/src/layer/convolutiondepthwise1d.cpp
index 79c83168051..2ace80658e7 100644
--- a/src/layer/convolutiondepthwise1d.cpp
+++ b/src/layer/convolutiondepthwise1d.cpp
@@ -59,6 +59,9 @@ int ConvolutionDepthWise1D::load_param(const ParamDict& pd)
int ConvolutionDepthWise1D::load_model(const ModelBin& mb)
{
+ if (dynamic_weight)
+ return 0;
+
weight_data = mb.load(weight_data_size, 0);
if (weight_data.empty())
return -100;
@@ -73,11 +76,6 @@ int ConvolutionDepthWise1D::load_model(const ModelBin& mb)
return 0;
}
-int ConvolutionDepthWise1D::create_pipeline(const Option&)
-{
- return 0;
-}
-
static int convolutiondepthwise1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
const int h = bottom_blob.h;
diff --git a/src/layer/convolutiondepthwise1d.h b/src/layer/convolutiondepthwise1d.h
index e2c195dc489..6026f04981d 100644
--- a/src/layer/convolutiondepthwise1d.h
+++ b/src/layer/convolutiondepthwise1d.h
@@ -28,8 +28,6 @@ class ConvolutionDepthWise1D : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/fused_activation.h b/src/layer/fused_activation.h
index 275fd9e2f9a..d5919257792 100644
--- a/src/layer/fused_activation.h
+++ b/src/layer/fused_activation.h
@@ -80,14 +80,14 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
if (activation_type == 1)
{
- activation = ncnn::create_layer(ncnn::LayerType::ReLU);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 2)
{
- activation = ncnn::create_layer(ncnn::LayerType::ReLU);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // slope
@@ -95,7 +95,7 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
}
else if (activation_type == 3)
{
- activation = ncnn::create_layer(ncnn::LayerType::Clip);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Clip);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // min
@@ -105,21 +105,21 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
}
else if (activation_type == 4)
{
- activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Sigmoid);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 5)
{
- activation = ncnn::create_layer(ncnn::LayerType::Mish);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Mish);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 6)
{
- activation = ncnn::create_layer(ncnn::LayerType::HardSwish);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::HardSwish);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // alpha
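
The switch to create_layer_cpu matters here: fused activations run inline inside a CPU forward path, and plain create_layer now returns the Layer_final wrapper, which would pointlessly probe for a Vulkan variant. Usage is unchanged; for example, activation_type 3 builds a Clip like this (a usage sketch with a hypothetical clamp range):

#include "layer.h"
#include "layer_type.h"

// sketch: what create_activation_layer(3, params, opt) now constructs
static ncnn::Layer* make_clip_activation()
{
    ncnn::Mat activation_params(2);
    activation_params[0] = 0.f; // min (hypothetical)
    activation_params[1] = 6.f; // max (hypothetical)

    ncnn::Layer* activation = ncnn::create_layer_cpu(ncnn::LayerType::Clip);
    ncnn::ParamDict pd;
    pd.set(0, activation_params[0]); // min
    pd.set(1, activation_params[1]); // max
    activation->load_param(pd);
    return activation;
}
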
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index 4cc22981c34..9cb422d21b6 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -69,21 +69,17 @@ int InnerProduct::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int InnerProduct::create_pipeline(const Option& opt)
-{
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
const int num_input = weight_data_size / num_output;
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
Mat weight_data_int8;
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.use_packing_layout = false;
quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
if (weight_data_int8.empty())
@@ -91,8 +87,6 @@ int InnerProduct::create_pipeline(const Option& opt)
weight_data = weight_data_int8.reshape(weight_data_size);
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
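
After this refactor the reference Convolution, ConvolutionDepthWise, and InnerProduct layers need no create_pipeline override at all: runtime int8 weight quantization happens once, at load time, gated only by int8_scale_term rather than opt.use_int8_inference — load_model receives no Option, hence the fresh single-threaded opt_q. Assembled from the + and context lines above for readability, the InnerProduct::load_model tail now reads:

#if NCNN_INT8
    // runtime quantize the weight data
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int num_input = weight_data_size / num_output;

        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        Mat weight_data_int8;

        Option opt_q;          // fresh Option: load_model has no opt parameter
        opt_q.num_threads = 1; // weights are quantized once, serially
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}
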
diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h
index 1f9b3fdc0a5..becf7b1d01a 100644
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -28,8 +28,6 @@ class InnerProduct : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
protected:
diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h
index 0a3143cea43..855f959cf00 100644
--- a/src/layer/loongarch/absval_loongarch.h
+++ b/src/layer/loongarch/absval_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_loongarch : virtual public AbsVal
+class AbsVal_loongarch : public AbsVal
{
public:
AbsVal_loongarch();
diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h
index 8b38d5e1f66..fb477a9aedb 100644
--- a/src/layer/loongarch/batchnorm_loongarch.h
+++ b/src/layer/loongarch/batchnorm_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_loongarch : virtual public BatchNorm
+class BatchNorm_loongarch : public BatchNorm
{
public:
BatchNorm_loongarch();
diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h
index f122ffa0dd9..35824997487 100644
--- a/src/layer/loongarch/bias_loongarch.h
+++ b/src/layer/loongarch/bias_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_loongarch : virtual public Bias
+class Bias_loongarch : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h
index bcf9ef5442f..2fc401ad610 100644
--- a/src/layer/loongarch/binaryop_loongarch.h
+++ b/src/layer/loongarch/binaryop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_loongarch : virtual public BinaryOp
+class BinaryOp_loongarch : public BinaryOp
{
public:
BinaryOp_loongarch();
diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h
index 1fe75c687d8..8925f242ed5 100644
--- a/src/layer/loongarch/cast_loongarch.h
+++ b/src/layer/loongarch/cast_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_loongarch : virtual public Cast
+class Cast_loongarch : public Cast
{
public:
Cast_loongarch();
diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h
index 43df62035ff..1ebeee2aeac 100644
--- a/src/layer/loongarch/clip_loongarch.h
+++ b/src/layer/loongarch/clip_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_loongarch : virtual public Clip
+class Clip_loongarch : public Clip
{
public:
Clip_loongarch();
diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h
index 934c85244df..91b32ef2faf 100644
--- a/src/layer/loongarch/concat_loongarch.h
+++ b/src/layer/loongarch/concat_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_loongarch : virtual public Concat
+class Concat_loongarch : public Concat
{
public:
Concat_loongarch();
diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp
index 0b1a11c868f..0917a79f62e 100644
--- a/src/layer/loongarch/convolution1d_loongarch.cpp
+++ b/src/layer/loongarch/convolution1d_loongarch.cpp
@@ -342,7 +342,7 @@ int Convolution1D_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
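
`create_layer` returns whatever implementation the runtime would pick by default; inside an arch-specific forward fallback that must stay on the CPU, `create_layer_cpu` pins the plain CPU implementation instead. A sketch of the fallback pattern used here, assuming ncnn's usual Layer lifecycle (load_param / create_pipeline / forward / destroy_pipeline); the weight-loading step is elided and the blob names are illustrative:

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);

    ncnn::ParamDict pd;
    pd.set(0, _num_output); // same parameter slot as the code above
    op->load_param(pd);

    // ... feed weight_data_flattened / bias_data_flattened through a ModelBin ...
    op->create_pipeline(opt);
    op->forward(bottom_blob_unpacked, top_blob, opt); // illustrative blob names
    op->destroy_pipeline(opt);
    delete op;
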
diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h
index 36393df4568..922fae598f4 100644
--- a/src/layer/loongarch/convolution1d_loongarch.h
+++ b/src/layer/loongarch/convolution1d_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_loongarch : virtual public Convolution1D
+class Convolution1D_loongarch : public Convolution1D
{
public:
Convolution1D_loongarch();
diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp
index 7816d1c66d2..3c5d0c1a424 100644
--- a/src/layer/loongarch/convolution_loongarch.cpp
+++ b/src/layer/loongarch/convolution_loongarch.cpp
@@ -225,10 +225,7 @@ int Convolution_loongarch::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -593,7 +590,7 @@ int Convolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::ve
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -792,10 +789,7 @@ int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
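
The `opt.lightmode` guard removals above follow one rule applied throughout the rest of this patch: once create_pipeline has produced the packed copy that forward() actually consumes, the raw weights are no longer needed, so they are released unconditionally. A schematic of the pattern with a stand-in Mat type, not verbatim ncnn code:

    struct Mat { void release() { /* drop the refcounted storage */ } };
    struct Option { bool lightmode; };

    Mat weight_data;    // raw weights from the model file
    Mat weight_data_tm; // packed/transformed copy used by forward()

    int create_pipeline_sketch(const Option& /*opt*/)
    {
        weight_data_tm = weight_data; // stand-in for the real weight packing
        weight_data.release();        // was: if (opt.lightmode) weight_data.release();
        return 0;
    }
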
diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h
index a84281bf713..7807f43f9f1 100644
--- a/src/layer/loongarch/convolution_loongarch.h
+++ b/src/layer/loongarch/convolution_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_loongarch : virtual public Convolution
+class Convolution_loongarch : public Convolution
{
public:
Convolution_loongarch();
diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
index 4d134cc4a39..0c5050dbce0 100644
--- a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
@@ -83,10 +83,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -94,10 +91,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -125,7 +119,7 @@ int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -537,7 +531,7 @@ int ConvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -606,16 +600,15 @@ int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option&
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h
index 554fe764304..35cdd8f008d 100644
--- a/src/layer/loongarch/convolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_loongarch : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_loongarch();
diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h
index 0ba460256d6..cfb4ff352ba 100644
--- a/src/layer/loongarch/crop_loongarch.h
+++ b/src/layer/loongarch/crop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_loongarch : virtual public Crop
+class Crop_loongarch : public Crop
{
public:
Crop_loongarch();
diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp
index 2d934bccb06..62b9d872b60 100644
--- a/src/layer/loongarch/deconvolution_loongarch.cpp
+++ b/src/layer/loongarch/deconvolution_loongarch.cpp
@@ -126,10 +126,7 @@ int Deconvolution_loongarch::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -348,7 +345,7 @@ int Deconvolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h
index f67b5d7e4e1..00ddf67e05b 100644
--- a/src/layer/loongarch/deconvolution_loongarch.h
+++ b/src/layer/loongarch/deconvolution_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_loongarch : virtual public Deconvolution
+class Deconvolution_loongarch : public Deconvolution
{
public:
Deconvolution_loongarch();
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
index f4f4d37bf7f..9495a99aae0 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
@@ -82,16 +82,15 @@ int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -119,7 +118,7 @@ int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -476,7 +475,7 @@ int DeconvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blo
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
index b710f07ecf3..87c5351fab4 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_loongarch : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_loongarch();
diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h
index 61a408d5c50..ae7d3fe6479 100644
--- a/src/layer/loongarch/dequantize_loongarch.h
+++ b/src/layer/loongarch/dequantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_loongarch : virtual public Dequantize
+class Dequantize_loongarch : public Dequantize
{
public:
Dequantize_loongarch();
diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h
index 42810050677..f9beff05034 100644
--- a/src/layer/loongarch/dropout_loongarch.h
+++ b/src/layer/loongarch/dropout_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_loongarch : virtual public Dropout
+class Dropout_loongarch : public Dropout
{
public:
Dropout_loongarch();
diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h
index f9715b20cad..f523132bb5f 100644
--- a/src/layer/loongarch/eltwise_loongarch.h
+++ b/src/layer/loongarch/eltwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_loongarch : virtual public Eltwise
+class Eltwise_loongarch : public Eltwise
{
public:
Eltwise_loongarch();
diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h
index afd35c701f5..da75fd12f3f 100644
--- a/src/layer/loongarch/flatten_loongarch.h
+++ b/src/layer/loongarch/flatten_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_loongarch : virtual public Flatten
+class Flatten_loongarch : public Flatten
{
public:
Flatten_loongarch();
diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h
index 755ae89ff03..519a4ba9594 100644
--- a/src/layer/loongarch/hardsigmoid_loongarch.h
+++ b/src/layer/loongarch/hardsigmoid_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_loongarch : virtual public HardSigmoid
+class HardSigmoid_loongarch : public HardSigmoid
{
public:
HardSigmoid_loongarch();
diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h
index e9b0821245c..ef69cb05417 100644
--- a/src/layer/loongarch/hardswish_loongarch.h
+++ b/src/layer/loongarch/hardswish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_loongarch : virtual public HardSwish
+class HardSwish_loongarch : public HardSwish
{
public:
HardSwish_loongarch();
diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp
index 34e908fc11a..e6b8eb0936b 100644
--- a/src/layer/loongarch/innerproduct_loongarch.cpp
+++ b/src/layer/loongarch/innerproduct_loongarch.cpp
@@ -37,7 +37,7 @@ InnerProduct_loongarch::InnerProduct_loongarch()
int InnerProduct_loongarch::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -99,10 +99,7 @@ int InnerProduct_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -655,10 +652,7 @@ int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1146,10 +1140,7 @@ int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h
index 4d9574ce919..2ae1a1e57e0 100644
--- a/src/layer/loongarch/innerproduct_loongarch.h
+++ b/src/layer/loongarch/innerproduct_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_loongarch : virtual public InnerProduct
+class InnerProduct_loongarch : public InnerProduct
{
public:
InnerProduct_loongarch();
diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h
index 4c0e0f3dc86..f1fa80705d5 100644
--- a/src/layer/loongarch/interp_loongarch.h
+++ b/src/layer/loongarch/interp_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_loongarch : virtual public Interp
+class Interp_loongarch : public Interp
{
public:
Interp_loongarch();
diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h
index 97c6f0520f5..0c796758064 100644
--- a/src/layer/loongarch/mish_loongarch.h
+++ b/src/layer/loongarch/mish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_loongarch : virtual public Mish
+class Mish_loongarch : public Mish
{
public:
Mish_loongarch();
diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h
index 1db215cfee7..476ebd33a87 100644
--- a/src/layer/loongarch/packing_loongarch.h
+++ b/src/layer/loongarch/packing_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_loongarch : virtual public Packing
+class Packing_loongarch : public Packing
{
public:
Packing_loongarch();
diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h
index 137fbc4459e..de416464783 100644
--- a/src/layer/loongarch/padding_loongarch.h
+++ b/src/layer/loongarch/padding_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_loongarch : virtual public Padding
+class Padding_loongarch : public Padding
{
public:
Padding_loongarch();
diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h
index 97e0c9ff2f7..646b10947b3 100644
--- a/src/layer/loongarch/pooling_loongarch.h
+++ b/src/layer/loongarch/pooling_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_loongarch : virtual public Pooling
+class Pooling_loongarch : public Pooling
{
public:
Pooling_loongarch();
diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h
index 97031bb0601..bafd7ac4c68 100644
--- a/src/layer/loongarch/prelu_loongarch.h
+++ b/src/layer/loongarch/prelu_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_loongarch : virtual public PReLU
+class PReLU_loongarch : public PReLU
{
public:
PReLU_loongarch();
diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h
index cae04aab171..dcc0d8e097e 100644
--- a/src/layer/loongarch/quantize_loongarch.h
+++ b/src/layer/loongarch/quantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_loongarch : virtual public Quantize
+class Quantize_loongarch : public Quantize
{
public:
Quantize_loongarch();
diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h
index 445c6e8febc..6ee6684fdb7 100644
--- a/src/layer/loongarch/relu_loongarch.h
+++ b/src/layer/loongarch/relu_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_loongarch : virtual public ReLU
+class ReLU_loongarch : public ReLU
{
public:
ReLU_loongarch();
diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h
index 8175989959e..4afaf9df3d3 100644
--- a/src/layer/loongarch/requantize_loongarch.h
+++ b/src/layer/loongarch/requantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_loongarch : virtual public Requantize
+class Requantize_loongarch : public Requantize
{
public:
Requantize_loongarch();
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h
index b15aad235db..02354d2a5a4 100644
--- a/src/layer/loongarch/sigmoid_loongarch.h
+++ b/src/layer/loongarch/sigmoid_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_loongarch : virtual public Sigmoid
+class Sigmoid_loongarch : public Sigmoid
{
public:
Sigmoid_loongarch();
diff --git a/src/layer/loongarch/slice_loongarch.h b/src/layer/loongarch/slice_loongarch.h
index b42138ba418..2f5faed8cbf 100644
--- a/src/layer/loongarch/slice_loongarch.h
+++ b/src/layer/loongarch/slice_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_loongarch : virtual public Slice
+class Slice_loongarch : public Slice
{
public:
Slice_loongarch();
diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h
index 3c8272a6412..baf930fcbd2 100644
--- a/src/layer/loongarch/softmax_loongarch.h
+++ b/src/layer/loongarch/softmax_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_loongarch : virtual public Softmax
+class Softmax_loongarch : public Softmax
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h
index b8d0b80f01e..9b7d2ac851f 100644
--- a/src/layer/loongarch/swish_loongarch.h
+++ b/src/layer/loongarch/swish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_loongarch : virtual public Swish
+class Swish_loongarch : public Swish
{
public:
Swish_loongarch();
diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h
index ecbab01ec8f..74231eb56b6 100644
--- a/src/layer/loongarch/tanh_loongarch.h
+++ b/src/layer/loongarch/tanh_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_loongarch : virtual public TanH
+class TanH_loongarch : public TanH
{
public:
TanH_loongarch();
diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h
index 8170bec50cf..f4210aeab57 100644
--- a/src/layer/loongarch/unaryop_loongarch.h
+++ b/src/layer/loongarch/unaryop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_loongarch : virtual public UnaryOp
+class UnaryOp_loongarch : public UnaryOp
{
public:
UnaryOp_loongarch();
diff --git a/src/layer/mips/absval_mips.h b/src/layer/mips/absval_mips.h
index c028c312f35..95dca4d596a 100644
--- a/src/layer/mips/absval_mips.h
+++ b/src/layer/mips/absval_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_mips : virtual public AbsVal
+class AbsVal_mips : public AbsVal
{
public:
AbsVal_mips();
diff --git a/src/layer/mips/batchnorm_mips.h b/src/layer/mips/batchnorm_mips.h
index c18902ebad7..6df49407a0e 100644
--- a/src/layer/mips/batchnorm_mips.h
+++ b/src/layer/mips/batchnorm_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_mips : virtual public BatchNorm
+class BatchNorm_mips : public BatchNorm
{
public:
BatchNorm_mips();
diff --git a/src/layer/mips/bias_mips.h b/src/layer/mips/bias_mips.h
index 3757c0b421e..dfef2159b4d 100644
--- a/src/layer/mips/bias_mips.h
+++ b/src/layer/mips/bias_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_mips : virtual public Bias
+class Bias_mips : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/mips/binaryop_mips.h b/src/layer/mips/binaryop_mips.h
index 55d0f2cf363..e682373ba56 100644
--- a/src/layer/mips/binaryop_mips.h
+++ b/src/layer/mips/binaryop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_mips : virtual public BinaryOp
+class BinaryOp_mips : public BinaryOp
{
public:
BinaryOp_mips();
diff --git a/src/layer/mips/cast_mips.h b/src/layer/mips/cast_mips.h
index e37374bda6c..adabee5f888 100644
--- a/src/layer/mips/cast_mips.h
+++ b/src/layer/mips/cast_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_mips : virtual public Cast
+class Cast_mips : public Cast
{
public:
Cast_mips();
diff --git a/src/layer/mips/clip_mips.h b/src/layer/mips/clip_mips.h
index 951888e0562..5db94bc5454 100644
--- a/src/layer/mips/clip_mips.h
+++ b/src/layer/mips/clip_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_mips : virtual public Clip
+class Clip_mips : public Clip
{
public:
Clip_mips();
diff --git a/src/layer/mips/concat_mips.h b/src/layer/mips/concat_mips.h
index 994ca85cf3b..c4ab84f3037 100644
--- a/src/layer/mips/concat_mips.h
+++ b/src/layer/mips/concat_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_mips : virtual public Concat
+class Concat_mips : public Concat
{
public:
Concat_mips();
diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp
index fc61c940687..e9cf211e49b 100644
--- a/src/layer/mips/convolution1d_mips.cpp
+++ b/src/layer/mips/convolution1d_mips.cpp
@@ -342,7 +342,7 @@ int Convolution1D_mips::forward(const std::vector<Mat>& bottom_blobs, std::vecto
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/convolution1d_mips.h b/src/layer/mips/convolution1d_mips.h
index 13e66e4f36c..dcc9bd4de4a 100644
--- a/src/layer/mips/convolution1d_mips.h
+++ b/src/layer/mips/convolution1d_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_mips : virtual public Convolution1D
+class Convolution1D_mips : public Convolution1D
{
public:
Convolution1D_mips();
diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp
index bc547e4a667..af420e61a9a 100644
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -225,10 +225,7 @@ int Convolution_mips::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -593,7 +590,7 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -792,10 +789,7 @@ int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/convolution_mips.h b/src/layer/mips/convolution_mips.h
index e8fe54f87a2..8401c6dfd51 100644
--- a/src/layer/mips/convolution_mips.h
+++ b/src/layer/mips/convolution_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_mips : virtual public Convolution
+class Convolution_mips : public Convolution
{
public:
Convolution_mips();
diff --git a/src/layer/mips/convolutiondepthwise_mips.cpp b/src/layer/mips/convolutiondepthwise_mips.cpp
index 991cb07872d..0c9bdca30ce 100644
--- a/src/layer/mips/convolutiondepthwise_mips.cpp
+++ b/src/layer/mips/convolutiondepthwise_mips.cpp
@@ -83,10 +83,7 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -94,10 +91,7 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -125,7 +119,7 @@ int ConvolutionDepthWise_mips::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -537,7 +531,7 @@ int ConvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, std
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -606,16 +600,15 @@ int ConvolutionDepthWise_mips::create_pipeline_int8_mips(const Option& opt)
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/convolutiondepthwise_mips.h b/src/layer/mips/convolutiondepthwise_mips.h
index 9d28009b8a1..24d1650b0c0 100644
--- a/src/layer/mips/convolutiondepthwise_mips.h
+++ b/src/layer/mips/convolutiondepthwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_mips : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_mips : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_mips();
diff --git a/src/layer/mips/crop_mips.h b/src/layer/mips/crop_mips.h
index e61c73a44d1..77c077e7153 100644
--- a/src/layer/mips/crop_mips.h
+++ b/src/layer/mips/crop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_mips : virtual public Crop
+class Crop_mips : public Crop
{
public:
Crop_mips();
diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp
index 506d3072096..208400f532e 100644
--- a/src/layer/mips/deconvolution_mips.cpp
+++ b/src/layer/mips/deconvolution_mips.cpp
@@ -126,10 +126,7 @@ int Deconvolution_mips::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -348,7 +345,7 @@ int Deconvolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vecto
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/deconvolution_mips.h b/src/layer/mips/deconvolution_mips.h
index 218bd812672..b7c0d2e7578 100644
--- a/src/layer/mips/deconvolution_mips.h
+++ b/src/layer/mips/deconvolution_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_mips : virtual public Deconvolution
+class Deconvolution_mips : public Deconvolution
{
public:
Deconvolution_mips();
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp
index 533bf522ad9..e6f5dd43478 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.cpp
+++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp
@@ -82,16 +82,15 @@ int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -119,7 +118,7 @@ int DeconvolutionDepthWise_mips::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -476,7 +475,7 @@ int DeconvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, s
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.h b/src/layer/mips/deconvolutiondepthwise_mips.h
index a033d7c11c3..24e7a481edf 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.h
+++ b/src/layer/mips/deconvolutiondepthwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_mips : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_mips : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_mips();
diff --git a/src/layer/mips/dequantize_mips.h b/src/layer/mips/dequantize_mips.h
index 09623e20d4f..8ae7e542c12 100644
--- a/src/layer/mips/dequantize_mips.h
+++ b/src/layer/mips/dequantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_mips : virtual public Dequantize
+class Dequantize_mips : public Dequantize
{
public:
Dequantize_mips();
diff --git a/src/layer/mips/dropout_mips.h b/src/layer/mips/dropout_mips.h
index a5a4dbebb90..05fa38463d7 100644
--- a/src/layer/mips/dropout_mips.h
+++ b/src/layer/mips/dropout_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_mips : virtual public Dropout
+class Dropout_mips : public Dropout
{
public:
Dropout_mips();
diff --git a/src/layer/mips/eltwise_mips.h b/src/layer/mips/eltwise_mips.h
index 55252ec661d..9b4ac77319f 100644
--- a/src/layer/mips/eltwise_mips.h
+++ b/src/layer/mips/eltwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_mips : virtual public Eltwise
+class Eltwise_mips : public Eltwise
{
public:
Eltwise_mips();
diff --git a/src/layer/mips/flatten_mips.h b/src/layer/mips/flatten_mips.h
index 725ceda6431..c9f33225f98 100644
--- a/src/layer/mips/flatten_mips.h
+++ b/src/layer/mips/flatten_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_mips : virtual public Flatten
+class Flatten_mips : public Flatten
{
public:
Flatten_mips();
diff --git a/src/layer/mips/hardsigmoid_mips.h b/src/layer/mips/hardsigmoid_mips.h
index a1ce9986eca..51cab82627f 100644
--- a/src/layer/mips/hardsigmoid_mips.h
+++ b/src/layer/mips/hardsigmoid_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_mips : virtual public HardSigmoid
+class HardSigmoid_mips : public HardSigmoid
{
public:
HardSigmoid_mips();
diff --git a/src/layer/mips/hardswish_mips.h b/src/layer/mips/hardswish_mips.h
index 692cf22eac2..8ace7fe79f5 100644
--- a/src/layer/mips/hardswish_mips.h
+++ b/src/layer/mips/hardswish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_mips : virtual public HardSwish
+class HardSwish_mips : public HardSwish
{
public:
HardSwish_mips();
diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp
index b064a20e522..9d926bfd08d 100644
--- a/src/layer/mips/innerproduct_mips.cpp
+++ b/src/layer/mips/innerproduct_mips.cpp
@@ -37,7 +37,7 @@ InnerProduct_mips::InnerProduct_mips()
int InnerProduct_mips::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -99,10 +99,7 @@ int InnerProduct_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -655,10 +652,7 @@ int InnerProduct_mips::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1146,10 +1140,7 @@ int InnerProduct_mips::create_pipeline_int8_mips(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/innerproduct_mips.h b/src/layer/mips/innerproduct_mips.h
index 59b26c53627..c96db3f93d1 100644
--- a/src/layer/mips/innerproduct_mips.h
+++ b/src/layer/mips/innerproduct_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_mips : virtual public InnerProduct
+class InnerProduct_mips : public InnerProduct
{
public:
InnerProduct_mips();
diff --git a/src/layer/mips/interp_mips.h b/src/layer/mips/interp_mips.h
index c15b4990cde..baff10b4e38 100644
--- a/src/layer/mips/interp_mips.h
+++ b/src/layer/mips/interp_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_mips : virtual public Interp
+class Interp_mips : public Interp
{
public:
Interp_mips();
diff --git a/src/layer/mips/mish_mips.h b/src/layer/mips/mish_mips.h
index 68cc9ff6f0f..33342a4f5d3 100644
--- a/src/layer/mips/mish_mips.h
+++ b/src/layer/mips/mish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_mips : virtual public Mish
+class Mish_mips : public Mish
{
public:
Mish_mips();
diff --git a/src/layer/mips/packing_mips.h b/src/layer/mips/packing_mips.h
index e90536f4908..ccc57f8af7b 100644
--- a/src/layer/mips/packing_mips.h
+++ b/src/layer/mips/packing_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_mips : virtual public Packing
+class Packing_mips : public Packing
{
public:
Packing_mips();
diff --git a/src/layer/mips/padding_mips.h b/src/layer/mips/padding_mips.h
index 3153f3e2b35..6d4ae8c2f70 100644
--- a/src/layer/mips/padding_mips.h
+++ b/src/layer/mips/padding_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_mips : virtual public Padding
+class Padding_mips : public Padding
{
public:
Padding_mips();
diff --git a/src/layer/mips/pooling_mips.h b/src/layer/mips/pooling_mips.h
index dab4038ecca..ec17a06a99c 100644
--- a/src/layer/mips/pooling_mips.h
+++ b/src/layer/mips/pooling_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_mips : virtual public Pooling
+class Pooling_mips : public Pooling
{
public:
Pooling_mips();
diff --git a/src/layer/mips/prelu_mips.h b/src/layer/mips/prelu_mips.h
index 9ef259ce833..6174c2570c3 100644
--- a/src/layer/mips/prelu_mips.h
+++ b/src/layer/mips/prelu_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_mips : virtual public PReLU
+class PReLU_mips : public PReLU
{
public:
PReLU_mips();
diff --git a/src/layer/mips/quantize_mips.h b/src/layer/mips/quantize_mips.h
index 2607e573f5d..220d73af106 100644
--- a/src/layer/mips/quantize_mips.h
+++ b/src/layer/mips/quantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_mips : virtual public Quantize
+class Quantize_mips : public Quantize
{
public:
Quantize_mips();
diff --git a/src/layer/mips/relu_mips.h b/src/layer/mips/relu_mips.h
index 7fdeae828ef..74e55a6be10 100644
--- a/src/layer/mips/relu_mips.h
+++ b/src/layer/mips/relu_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_mips : virtual public ReLU
+class ReLU_mips : public ReLU
{
public:
ReLU_mips();
diff --git a/src/layer/mips/requantize_mips.h b/src/layer/mips/requantize_mips.h
index a9138b9ea72..6ba740895d2 100644
--- a/src/layer/mips/requantize_mips.h
+++ b/src/layer/mips/requantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_mips : virtual public Requantize
+class Requantize_mips : public Requantize
{
public:
Requantize_mips();
diff --git a/src/layer/mips/sigmoid_mips.h b/src/layer/mips/sigmoid_mips.h
index 7ba089b3b4c..2bf166e954d 100644
--- a/src/layer/mips/sigmoid_mips.h
+++ b/src/layer/mips/sigmoid_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_mips : virtual public Sigmoid
+class Sigmoid_mips : public Sigmoid
{
public:
Sigmoid_mips();
diff --git a/src/layer/mips/slice_mips.h b/src/layer/mips/slice_mips.h
index 648233f8e6c..73274d867a0 100644
--- a/src/layer/mips/slice_mips.h
+++ b/src/layer/mips/slice_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_mips : virtual public Slice
+class Slice_mips : public Slice
{
public:
Slice_mips();
diff --git a/src/layer/mips/softmax_mips.h b/src/layer/mips/softmax_mips.h
index 06ce5e16284..91437c13f56 100644
--- a/src/layer/mips/softmax_mips.h
+++ b/src/layer/mips/softmax_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_mips : virtual public Softmax
+class Softmax_mips : public Softmax
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/mips/swish_mips.h b/src/layer/mips/swish_mips.h
index 706106d9269..1dc6753a381 100644
--- a/src/layer/mips/swish_mips.h
+++ b/src/layer/mips/swish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_mips : virtual public Swish
+class Swish_mips : public Swish
{
public:
Swish_mips();
diff --git a/src/layer/mips/tanh_mips.h b/src/layer/mips/tanh_mips.h
index d1310f18310..12e38d07f71 100644
--- a/src/layer/mips/tanh_mips.h
+++ b/src/layer/mips/tanh_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_mips : virtual public TanH
+class TanH_mips : public TanH
{
public:
TanH_mips();
diff --git a/src/layer/mips/unaryop_mips.h b/src/layer/mips/unaryop_mips.h
index 0a6f12bc3e5..800d028bb21 100644
--- a/src/layer/mips/unaryop_mips.h
+++ b/src/layer/mips/unaryop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_mips : virtual public UnaryOp
+class UnaryOp_mips : public UnaryOp
{
public:
UnaryOp_mips();
diff --git a/src/layer/noop.cpp b/src/layer/noop.cpp
index 68572b0ba28..a8b42f70e83 100644
--- a/src/layer/noop.cpp
+++ b/src/layer/noop.cpp
@@ -20,11 +20,9 @@ namespace ncnn {
Noop::Noop()
{
support_inplace = true;
- support_vulkan = true;
support_packing = true;
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh();
support_bf16_storage = true;
- support_image_storage = true;
}
int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
@@ -32,16 +30,4 @@ int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option&
return 0;
}
-#if NCNN_VULKAN
-int Noop::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- return 0;
-}
-
-int Noop::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- return 0;
-}
-#endif // NCNN_VULKAN
-
} // namespace ncnn
diff --git a/src/layer/noop.h b/src/layer/noop.h
index 1fb7af35c08..75bbdd1a308 100644
--- a/src/layer/noop.h
+++ b/src/layer/noop.h
@@ -25,11 +25,6 @@ class Noop : public Layer
Noop();
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
-
-#if NCNN_VULKAN
- virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
- virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
-#endif // NCNN_VULKAN
};
} // namespace ncnn
diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp
index 4d4f7fb578b..55648f8eaf1 100644
--- a/src/layer/reduction.cpp
+++ b/src/layer/reduction.cpp
@@ -1064,7 +1064,11 @@ struct post_process_sqrt
{
T operator()(const T& x) const
{
- return static_cast<T>(sqrtf(x));
+ // math optimization will probably generate rsqrt
+ // that produce -inf on sse with subnormal input
+ // flush subnormal input to zero as a workaround
+ // TODO explicit use simd sqrt like unaryop --- nihui
+ return static_cast<T>(sqrtf(x < FLT_MIN ? 0.f : x));
}
};
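
The guard added above exists because, under fast-math style optimization, the compiler may lower sqrtf in this loop to a reciprocal-square-root (rsqrt) approximation, and SSE rsqrt of a subnormal input overflows, surfacing as -inf after refinement. Flushing anything below FLT_MIN to zero sidesteps that. A standalone sketch of the same guard:

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    // Flush subnormal (and negative) inputs to zero before sqrtf so a
    // compiler-generated rsqrt approximation cannot produce -inf or NaN.
    static float safe_sqrt(float x)
    {
        return sqrtf(x < FLT_MIN ? 0.f : x);
    }

    int main()
    {
        float subnormal = FLT_MIN / 4.f; // a positive subnormal value
        printf("sqrt(%g) -> %g\n", subnormal, safe_sqrt(subnormal)); // prints 0
        return 0;
    }
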
diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h
index 66d33c834a8..0d35c6b61a0 100644
--- a/src/layer/riscv/absval_riscv.h
+++ b/src/layer/riscv/absval_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_riscv : virtual public AbsVal
+class AbsVal_riscv : public AbsVal
{
public:
AbsVal_riscv();
diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h
index e2365fa5fcf..1ed4dc63d0d 100644
--- a/src/layer/riscv/batchnorm_riscv.h
+++ b/src/layer/riscv/batchnorm_riscv.h
@@ -18,7 +18,7 @@
#include "batchnorm.h"
namespace ncnn {
-class BatchNorm_riscv : virtual public BatchNorm
+class BatchNorm_riscv : public BatchNorm
{
public:
BatchNorm_riscv();
diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h
index 0ecd34d685c..afc728b6e68 100644
--- a/src/layer/riscv/binaryop_riscv.h
+++ b/src/layer/riscv/binaryop_riscv.h
@@ -21,7 +21,7 @@
namespace ncnn {
-class BinaryOp_riscv : virtual public BinaryOp
+class BinaryOp_riscv : public BinaryOp
{
public:
BinaryOp_riscv();
diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h
index 4b55159d819..7c6fbb6d4ce 100644
--- a/src/layer/riscv/cast_riscv.h
+++ b/src/layer/riscv/cast_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_riscv : virtual public Cast
+class Cast_riscv : public Cast
{
public:
Cast_riscv();
diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h
index 16a9eb963f9..051995e18d6 100644
--- a/src/layer/riscv/clip_riscv.h
+++ b/src/layer/riscv/clip_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_riscv : virtual public Clip
+class Clip_riscv : public Clip
{
public:
Clip_riscv();
diff --git a/src/layer/riscv/concat_riscv.h b/src/layer/riscv/concat_riscv.h
index eb85d47819d..23029340350 100644
--- a/src/layer/riscv/concat_riscv.h
+++ b/src/layer/riscv/concat_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_riscv : virtual public Concat
+class Concat_riscv : public Concat
{
public:
Concat_riscv();
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index d3d17861d89..6c581a0edeb 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -387,7 +387,7 @@ int Convolution1D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vect
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -470,6 +470,8 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+ weight_data.release();
+
return 0;
}
diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h
index 2aa4bbe0f41..f0e7f881801 100644
--- a/src/layer/riscv/convolution1d_riscv.h
+++ b/src/layer/riscv/convolution1d_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_riscv : virtual public Convolution1D
+class Convolution1D_riscv : public Convolution1D
{
public:
Convolution1D_riscv();
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index 4c4d57c6a57..be413e5be25 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -237,10 +237,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -677,7 +674,7 @@ int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -837,10 +834,7 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h
index 17bb43ca0e5..a4e008c9dd1 100644
--- a/src/layer/riscv/convolution_riscv.h
+++ b/src/layer/riscv/convolution_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_riscv : virtual public Convolution
+class Convolution_riscv : public Convolution
{
public:
Convolution_riscv();
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index eb39ac0baa7..d913fe7e1d5 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -104,10 +104,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -115,10 +112,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -146,7 +140,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -613,7 +607,7 @@ int ConvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs, st
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -688,10 +682,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -699,10 +690,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h
index b0152e0b207..f9503975296 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.h
+++ b/src/layer/riscv/convolutiondepthwise_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_riscv : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_riscv : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_riscv();
diff --git a/src/layer/riscv/crop_riscv.h b/src/layer/riscv/crop_riscv.h
index 86d2c8064e3..404022fafb2 100644
--- a/src/layer/riscv/crop_riscv.h
+++ b/src/layer/riscv/crop_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_riscv : virtual public Crop
+class Crop_riscv : public Crop
{
public:
Crop_riscv();
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 9202d367f93..6b395282908 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -148,10 +148,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -404,7 +401,7 @@ int Deconvolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vect
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -533,10 +530,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 903a420427a..57d30349aad 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_riscv : virtual public Deconvolution
+class Deconvolution_riscv : public Deconvolution
{
public:
Deconvolution_riscv();
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
index eee765c4ea6..7b567cf63e0 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -97,10 +97,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -108,10 +105,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -139,7 +133,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -531,7 +525,7 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs,
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -625,10 +619,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -636,10 +627,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h
index 5cdbd0d0676..b0c8f7b0119 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.h
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_riscv : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_riscv();
diff --git a/src/layer/riscv/dropout_riscv.h b/src/layer/riscv/dropout_riscv.h
index d685c0ee3b4..9c28d867251 100644
--- a/src/layer/riscv/dropout_riscv.h
+++ b/src/layer/riscv/dropout_riscv.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class Dropout_riscv : virtual public Dropout
+class Dropout_riscv : public Dropout
{
public:
Dropout_riscv();
diff --git a/src/layer/riscv/flatten_riscv.h b/src/layer/riscv/flatten_riscv.h
index 52a290ca678..31860340213 100644
--- a/src/layer/riscv/flatten_riscv.h
+++ b/src/layer/riscv/flatten_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_riscv : virtual public Flatten
+class Flatten_riscv : public Flatten
{
public:
Flatten_riscv();
diff --git a/src/layer/riscv/gelu_riscv.h b/src/layer/riscv/gelu_riscv.h
index fbe522694d1..8a2e9492cc9 100644
--- a/src/layer/riscv/gelu_riscv.h
+++ b/src/layer/riscv/gelu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_riscv : virtual public GELU
+class GELU_riscv : public GELU
{
public:
GELU_riscv();
diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp
index ec5a5cdac41..9b4b58ac651 100644
--- a/src/layer/riscv/gemm_riscv.cpp
+++ b/src/layer/riscv/gemm_riscv.cpp
@@ -99,23 +99,10 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
vfloat32m1_t _r6h = vle32_v_f32m1(p6 + 4, vl);
vfloat32m1_t _r7l = vle32_v_f32m1(p7, vl);
vfloat32m1_t _r7h = vle32_v_f32m1(p7 + 4, vl);
- transpose8x8_ps(_r0l, _r0h, _r1l, _r1h, _r2l, _r2h, _r3l, _r3h, _r4l, _r4h, _r5l, _r5h, _r6l, _r6h, _r7l, _r7h, vl);
- vse32_v_f32m1(pp, _r0l, vl);
- vse32_v_f32m1(pp + 4, _r0h, vl);
- vse32_v_f32m1(pp + 8, _r1l, vl);
- vse32_v_f32m1(pp + 12, _r1h, vl);
- vse32_v_f32m1(pp + 8 * 2, _r2l, vl);
- vse32_v_f32m1(pp + 8 * 2 + 4, _r2h, vl);
- vse32_v_f32m1(pp + 8 * 3, _r3l, vl);
- vse32_v_f32m1(pp + 8 * 3 + 4, _r3h, vl);
- vse32_v_f32m1(pp + 8 * 4, _r4l, vl);
- vse32_v_f32m1(pp + 8 * 4 + 4, _r4h, vl);
- vse32_v_f32m1(pp + 8 * 5, _r5l, vl);
- vse32_v_f32m1(pp + 8 * 5 + 4, _r5h, vl);
- vse32_v_f32m1(pp + 8 * 6, _r6l, vl);
- vse32_v_f32m1(pp + 8 * 6 + 4, _r6h, vl);
- vse32_v_f32m1(pp + 8 * 7, _r7l, vl);
- vse32_v_f32m1(pp + 8 * 7 + 4, _r7h, vl);
+
+ vsseg8e32_v_f32m1(pp, _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(pp + 32, _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
pp += 64;
p0 += 8;
p1 += 8;
@@ -175,7 +162,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
- store_float_v4(v0, v1, v2, v3, pp, vl);
+ vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
pp += 16;
p0 += 4;
p1 += 4;
@@ -210,7 +197,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += 4;
p1 += 4;
@@ -353,7 +340,7 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += A_hstep * 4;
}
@@ -562,17 +549,8 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl);
vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl);
- transpose4x4_ps(_r0, _r1, _r2, _r3, vl);
- transpose4x4_ps(_r4, _r5, _r6, _r7, vl);
+ vsseg8e32_v_f32m1(pp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
- vse32_v_f32m1(pp, _r0, vl);
- vse32_v_f32m1(pp + 4, _r4, vl);
- vse32_v_f32m1(pp + 4 * 2, _r1, vl);
- vse32_v_f32m1(pp + 4 * 3, _r5, vl);
- vse32_v_f32m1(pp + 4 * 4, _r2, vl);
- vse32_v_f32m1(pp + 4 * 5, _r6, vl);
- vse32_v_f32m1(pp + 4 * 6, _r3, vl);
- vse32_v_f32m1(pp + 4 * 7, _r7, vl);
pp += 32;
p0 += 4;
p1 += 4;
@@ -632,7 +610,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
- store_float_v4(v0, v1, v2, v3, pp, vl);
+ vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
pp += 16;
p0 += 4;
p1 += 4;
@@ -667,7 +645,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += 4;
p1 += 4;
@@ -865,7 +843,7 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += B_hstep * 4;
}
@@ -937,12 +915,12 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
vfloat32m1_t v1 = vle32_v_f32m1(pp + 8, vl);
vfloat32m1_t v2 = vle32_v_f32m1(pp + 16, vl);
vfloat32m1_t v3 = vle32_v_f32m1(pp + 24, vl);
- store_float_v4(v0, v1, v2, v3, p0, vl);
+ vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
v0 = vle32_v_f32m1(pp + 4, vl);
v1 = vle32_v_f32m1(pp + 12, vl);
v2 = vle32_v_f32m1(pp + 20, vl);
v3 = vle32_v_f32m1(pp + 28, vl);
- store_float_v4(v0, v1, v2, v3, p0 + 16, vl);
+ vsseg4e32_v_f32m1(p0 + 16, v0, v1, v2, v3, vl);
pp += 32;
p0 += out_hstep * 4;
}
@@ -974,7 +952,7 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
vfloat32m1_t v1 = vle32_v_f32m1(pp + 4, vl);
vfloat32m1_t v2 = vle32_v_f32m1(pp + 8, vl);
vfloat32m1_t v3 = vle32_v_f32m1(pp + 12, vl);
- store_float_v4(v0, v1, v2, v3, p0, vl);
+ vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
pp += 16;
p0 += out_hstep * 4;
}
@@ -2887,9 +2865,9 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum00, _sum10, outptr, vl);
- store_float_v2(_sum01, _sum11, outptr + 8, vl);
- store_float_v2(_sum02, _sum12, outptr + 16, vl);
+ vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+ vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
+ vsseg2e32_v_f32m1(outptr + 16, _sum02, _sum12, vl);
}
outptr += 24;
@@ -2974,8 +2952,8 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum00, _sum10, outptr, vl);
- store_float_v2(_sum01, _sum11, outptr + 8, vl);
+ vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+ vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
}
outptr += 16;
@@ -3048,7 +3026,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum0, _sum1, outptr, vl);
+ vsseg2e32_v_f32m1(outptr, _sum0, _sum1, vl);
}
outptr += 8;
@@ -4006,10 +3984,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4049,10 +4024,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4082,10 +4054,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
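
The gemm changes above replace explicit in-register transposes plus a run of plain vse32 stores with single segmented stores: vssegNe32 writes its N vector operands interleaved in memory (v0[0], v1[0], ..., v0[1], v1[1], ...), which is exactly the transposed tile layout pack_A_tile and pack_B_tile want. A scalar-equivalent sketch of the two-register case; the real code uses the RVV intrinsic and needs a RISC-V vector toolchain:

    // Scalar equivalent of vsseg2e32_v_f32m1(pp, v0, v1, vl): the two vectors
    // are interleaved on store, transposing a 2 x vl tile in one pass.
    void sseg2_scalar(float* pp, const float* v0, const float* v1, int vl)
    {
        for (int i = 0; i < vl; i++)
        {
            pp[2 * i + 0] = v0[i];
            pp[2 * i + 1] = v1[i];
        }
    }
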
diff --git a/src/layer/riscv/gemm_riscv.h b/src/layer/riscv/gemm_riscv.h
index b92add63891..6bca092fb1f 100644
--- a/src/layer/riscv/gemm_riscv.h
+++ b/src/layer/riscv/gemm_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_riscv : virtual public Gemm
+class Gemm_riscv : public Gemm
{
public:
Gemm_riscv();
diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp
index 28afa5081d0..c7e36c1c0fc 100644
--- a/src/layer/riscv/gru_riscv.cpp
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -714,6 +714,10 @@ int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h
index 18c69ab594b..46bb624519f 100644
--- a/src/layer/riscv/gru_riscv.h
+++ b/src/layer/riscv/gru_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GRU_riscv : virtual public GRU
+class GRU_riscv : public GRU
{
public:
GRU_riscv();
diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h
index b876c485b62..3c264b3188e 100644
--- a/src/layer/riscv/hardsigmoid_riscv.h
+++ b/src/layer/riscv/hardsigmoid_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_riscv : virtual public HardSigmoid
+class HardSigmoid_riscv : public HardSigmoid
{
public:
HardSigmoid_riscv();
diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h
index 662cd067024..cfec7916f59 100644
--- a/src/layer/riscv/hardswish_riscv.h
+++ b/src/layer/riscv/hardswish_riscv.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class HardSwish_riscv : virtual public HardSwish
+class HardSwish_riscv : public HardSwish
{
public:
HardSwish_riscv();
diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp
index ac7b3169708..accfc683584 100644
--- a/src/layer/riscv/innerproduct_riscv.cpp
+++ b/src/layer/riscv/innerproduct_riscv.cpp
@@ -40,7 +40,7 @@ InnerProduct_riscv::InnerProduct_riscv()
int InnerProduct_riscv::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
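
A note on the create_layer_cpu change here (and the create_layer_vulkan calls in the Vulkan files below): plain create_layer resolves through the full registry and, depending on the build, may hand back an object wired for another backend, whereas a CPU layer's internal Flatten helper must be the plain CPU implementation. A hedged sketch of the helper-layer lifecycle as used in these files:

```cpp
#include "layer.h"      // ncnn::Layer, ncnn::create_layer_cpu, ncnn::ParamDict
#include "layer_type.h" // ncnn::LayerType

// Sketch: create -> load_param -> create_pipeline -> (forward) -> destroy.
static ncnn::Layer* make_flatten_helper(const ncnn::Option& opt)
{
    ncnn::Layer* flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);

    ncnn::ParamDict pd; // Flatten takes no parameters
    flatten->load_param(pd);
    flatten->create_pipeline(opt);
    return flatten; // caller runs forward(), then destroy_pipeline() + delete
}
```
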
@@ -106,10 +106,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -563,10 +560,7 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h
index 0503ea3d4fa..d3056d5801d 100644
--- a/src/layer/riscv/innerproduct_riscv.h
+++ b/src/layer/riscv/innerproduct_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_riscv : virtual public InnerProduct
+class InnerProduct_riscv : public InnerProduct
{
public:
InnerProduct_riscv();
diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h
index 80583cc2c89..b0d2e9004ac 100644
--- a/src/layer/riscv/instancenorm_riscv.h
+++ b/src/layer/riscv/instancenorm_riscv.h
@@ -18,7 +18,7 @@
#include "instancenorm.h"
namespace ncnn {
-class InstanceNorm_riscv : virtual public InstanceNorm
+class InstanceNorm_riscv : public InstanceNorm
{
public:
InstanceNorm_riscv();
diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h
index 2f6ca89da34..f479223519b 100644
--- a/src/layer/riscv/interp_riscv.h
+++ b/src/layer/riscv/interp_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_riscv : virtual public Interp
+class Interp_riscv : public Interp
{
public:
Interp_riscv();
diff --git a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h
index 5421ebb2791..2e2be1a2b44 100644
--- a/src/layer/riscv/mish_riscv.h
+++ b/src/layer/riscv/mish_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_riscv : virtual public Mish
+class Mish_riscv : public Mish
{
public:
Mish_riscv();
diff --git a/src/layer/riscv/packing_riscv.h b/src/layer/riscv/packing_riscv.h
index 4d556890f3f..097d774993c 100644
--- a/src/layer/riscv/packing_riscv.h
+++ b/src/layer/riscv/packing_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_riscv : virtual public Packing
+class Packing_riscv : public Packing
{
public:
Packing_riscv();
diff --git a/src/layer/riscv/padding_riscv.h b/src/layer/riscv/padding_riscv.h
index c591806fa3e..7642dccae5f 100644
--- a/src/layer/riscv/padding_riscv.h
+++ b/src/layer/riscv/padding_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_riscv : virtual public Padding
+class Padding_riscv : public Padding
{
public:
Padding_riscv();
diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h
index 48d8feb8233..e285b58eb19 100644
--- a/src/layer/riscv/pooling_riscv.h
+++ b/src/layer/riscv/pooling_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_riscv : virtual public Pooling
+class Pooling_riscv : public Pooling
{
public:
Pooling_riscv();
diff --git a/src/layer/riscv/prelu_riscv.h b/src/layer/riscv/prelu_riscv.h
index 23e5b7ee998..70acbc5d250 100644
--- a/src/layer/riscv/prelu_riscv.h
+++ b/src/layer/riscv/prelu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_riscv : virtual public PReLU
+class PReLU_riscv : public PReLU
{
public:
PReLU_riscv();
diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h
index 516f90d3d76..58181b533b8 100644
--- a/src/layer/riscv/relu_riscv.h
+++ b/src/layer/riscv/relu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_riscv : virtual public ReLU
+class ReLU_riscv : public ReLU
{
public:
ReLU_riscv();
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index 938d3ce3998..e2824646f87 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -86,282 +86,6 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr)
return vloxei32_v_f32m8(ptr, bindex, vl);
}
-static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
-{
- float tmp[8][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
-}
-
-static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
-{
- float tmp[4][4];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
-}
-
-static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
- vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
- vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
- vfloat32m1_t& _ral, vfloat32m1_t& _rah,
- vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
-{
- float tmp[8][12];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
- vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
- vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
- vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
- vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
- vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
- vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
- vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
- vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
- _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
- _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
- _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
- _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
- _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
- _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
- _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
- _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
-{
- float tmp[12][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0m, vl);
- vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1m, vl);
- vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2m, vl);
- vsse32_v_f32m1(&tmp[8][2], sizeof(float) * 8, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3m, vl);
- vsse32_v_f32m1(&tmp[8][3], sizeof(float) * 8, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4m, vl);
- vsse32_v_f32m1(&tmp[8][4], sizeof(float) * 8, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5m, vl);
- vsse32_v_f32m1(&tmp[8][5], sizeof(float) * 8, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6m, vl);
- vsse32_v_f32m1(&tmp[8][6], sizeof(float) * 8, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7m, vl);
- vsse32_v_f32m1(&tmp[8][7], sizeof(float) * 8, _r7h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
- _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
- _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
- _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
-{
- float tmp[4][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
-{
- float tmp[4][12];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
- vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
- vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
- vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
- vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
- _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
- _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
-}
-
-static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
-{
- float tmp[8][4];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 4, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 4, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 4, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 4, _r3h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void store_float_v2(vfloat32m1_t& vector1, vfloat32m1_t& vector2, float* buf, size_t vl)
-{
- vsse32_v_f32m1(buf + 0, sizeof(float) * 2, vector1, vl);
- vsse32_v_f32m1(buf + 1, sizeof(float) * 2, vector2, vl);
-}
-
-static inline void store_float_v4(vfloat32m1_t& vector1, vfloat32m1_t& vector2, vfloat32m1_t& vector3, vfloat32m1_t& vector4, float* buf, size_t vl)
-{
- vsse32_v_f32m1(buf + 0, sizeof(float) * 4, vector1, vl);
- vsse32_v_f32m1(buf + 1, sizeof(float) * 4, vector2, vl);
- vsse32_v_f32m1(buf + 2, sizeof(float) * 4, vector3, vl);
- vsse32_v_f32m1(buf + 3, sizeof(float) * 4, vector4, vl);
-}
-
#if __riscv_zfh
static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr)
{
@@ -675,4 +399,221 @@ static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const f
#endif // __riscv_zfh
#endif // __riscv_vector
+#ifdef __riscv_vector
+
+static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
+{
+ float tmp[64];
+ vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(&tmp[32], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+}
+
+static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
+{
+ float tmp[16];
+ vsseg4e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, vl);
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+}
+
+static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
+ vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
+ vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
+ vfloat32m1_t& _ral, vfloat32m1_t& _rah,
+ vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
+{
+ float tmp[8][12];
+
+ vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
+ vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
+ vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
+ vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
+ vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
+ vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
+ vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
+ vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
+ vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
+ vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
+ vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
+ vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
+ vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
+ vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
+ vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
+ vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
+ vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
+ vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
+ vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
+ vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
+ vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
+ vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
+ vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
+ vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+ _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
+ _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
+ _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
+ _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
+ _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
+ _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
+ _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
+ _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
+{
+ float tmp[96];
+ vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(&tmp[32], _r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m, vl);
+ vsseg8e32_v_f32m1(&tmp[64], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
+ _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
+ _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
+ _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
+{
+ float tmp[32];
+ vsseg8e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
+
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+
+static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
+{
+ float tmp[4][12];
+ vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
+ vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
+ vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
+ vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
+ vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
+ vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
+ vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
+ vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
+ vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
+ vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
+ vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
+ vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
+}
+
+static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
+{
+ float tmp[32];
+ vsseg4e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, vl);
+ vsseg4e32_v_f32m1(&tmp[16], _r0h, _r1h, _r2h, _r3h, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+#endif
+
#endif // RISCV_USABILITY_H
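
The reinstated transpose helpers above lean on segment stores wherever the geometry allows: storing k registers with vsseg{k}e32 interleaves element i of every register into consecutive memory, which is exactly a k-by-vl transpose in row-major order, so contiguous reloads return the transposed rows. A hedged 4x4 demonstration, assuming vl == 4 (one f32m1 register per row):

```cpp
#include <riscv_vector.h>

// With rows r0..r3, vsseg4e32 writes
// tmp = { r0[0],r1[0],r2[0],r3[0], r0[1],r1[1],r2[1],r3[1], ... }
// -- the transpose in row-major order -- so four contiguous reloads
// return the transposed rows.
static void transpose4x4_demo(float m[16]) // m: row-major 4x4 matrix
{
    size_t vl = vsetvl_e32m1(4);
    vfloat32m1_t r0 = vle32_v_f32m1(m + 0, vl);
    vfloat32m1_t r1 = vle32_v_f32m1(m + 4, vl);
    vfloat32m1_t r2 = vle32_v_f32m1(m + 8, vl);
    vfloat32m1_t r3 = vle32_v_f32m1(m + 12, vl);

    float tmp[16];
    vsseg4e32_v_f32m1(tmp, r0, r1, r2, r3, vl); // tmp now holds m transposed

    vse32_v_f32m1(m + 0, vle32_v_f32m1(tmp + 0, vl), vl);
    vse32_v_f32m1(m + 4, vle32_v_f32m1(tmp + 4, vl), vl);
    vse32_v_f32m1(m + 8, vle32_v_f32m1(tmp + 8, vl), vl);
    vse32_v_f32m1(m + 12, vle32_v_f32m1(tmp + 12, vl), vl);
}
```

The 12-row variants (transpose8x12_ps, transpose4x12_ps) keep the strided vsse32 form because the V extension caps segment loads/stores at NFIELDS = 8, so there is no vsseg12 to reach for.
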
diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h
index 34f072788e5..980261a1496 100644
--- a/src/layer/riscv/rvv_mathfun.h
+++ b/src/layer/riscv/rvv_mathfun.h
@@ -512,8 +512,8 @@ _RVV_FLOAT32_FMA_HELPER(1)
\
vfloat32m##LMUL##_t tu = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_ra7, c_erfc_ra6, vl), c_erfc_ra5, vl), c_erfc_ra4, vl), c_erfc_ra3, vl), c_erfc_ra2, vl), c_erfc_ra1, vl), c_erfc_ra0, vl); \
vfloat32m##LMUL##_t tv = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_sa8, c_erfc_sa7, vl), c_erfc_sa6, vl), c_erfc_sa5, vl), c_erfc_sa4, vl), c_erfc_sa3, vl), c_erfc_sa2, vl), c_erfc_sa1, vl); \
- u = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0x1.6db6dap+1f, vl), tu, u, vl); /* u = absx < 0x1.6db6dap+1f ? tu : u;*/ \
- v = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0x1.6db6dap+1f, vl), tv, v, vl); /* v = absx < 0x1.6db6dap+1f ? tv : v;*/ \
+ u = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 2.857143f, vl), tu, u, vl); /* u = absx < 0x1.6db6dap+1f ? tu : u;*/ \
+ v = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 2.857143f, vl), tv, v, vl); /* v = absx < 0x1.6db6dap+1f ? tv : v;*/ \
\
tu = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_pa6, c_erfc_pa5, vl), c_erfc_pa4, vl), c_erfc_pa3, vl), c_erfc_pa2, vl), c_erfc_pa1, vl), c_erfc_pa0, vl); \
tv = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_qa6, c_erfc_qa5, vl), c_erfc_qa4, vl), c_erfc_qa3, vl), c_erfc_qa2, vl), c_erfc_qa1, vl); \
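
The literal change in this hunk is a portability fix with a subtle twist: hex-float literals like 0x1.6db6dap+1f are a C99/C++17 feature that some targeted toolchains reject, hence the decimal rewrite, and the in-line comments still carry the old hex form. Note, though, that the 7-digit decimal 2.857143f rounds one ulp above the old constant (2.85714293 vs 2.85714269), nudging the erfc regime-switch point (roughly 1/0.35) by about 2.4e-7, which is harmless for a branch cut but not bit-identical. A quick hedged check:

```cpp
#include <cstdio>
#include <cstdint>
#include <cstring>

int main()
{
    float a = 0x1.6db6dap+1f; // old literal: 2.85714269... (needs C++17/GNU C)
    float b = 2.857143f;      // new literal: 2.85714293... (old + 1 ulp)
    uint32_t ua, ub;
    std::memcpy(&ua, &a, 4);
    std::memcpy(&ub, &b, 4);
    std::printf("%.9g (0x%08x) vs %.9g (0x%08x)\n", a, ua, b, ub);
    return (int)(ub - ua); // 1: the two literals differ by exactly one ulp
}
```
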
@@ -532,28 +532,28 @@ _RVV_FLOAT32_FMA_HELPER(1)
vfloat32m##LMUL##_t q = vfdiv_vv_f32m##LMUL(u, v, vl); \
vfloat32m##LMUL##_t ret = vfmv_v_f_f32m##LMUL(0.f, vl); \
\
- vfloat32m##LMUL##_t z = vreinterpret_v_u32m##LMUL##_f32m##LMUL( vand_vx_u32m##LMUL(vreinterpret_v_f32m##LMUL##_u32m##LMUL(absx), 0xffff'f000, vl)); \
- \
- vfloat32m##LMUL##_t r = vfmul_vv_f32m##LMUL( exp_ps(vfmadd_vvf_f32m##LMUL(vfneg_v_f32m##LMUL(z, vl), z, -0.5625f, vl), vl), exp_ps(vfmadd_vv_f32m##LMUL(vfsub_vv_f32m##LMUL(z, absx, vl), vfadd_vv_f32m##LMUL(z, absx, vl), q, vl), vl), vl); \
- r = vfdiv_vv_f32m##LMUL(r, absx, vl); \
- t = vfrsub_vf_f32m##LMUL(r, 2.f, vl); \
- r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r */ \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 28.f, vl), r, ret, vl); /* abs < 28.f ? r : ret */ \
- \
- r = vfrsub_vf_f32m##LMUL(q, 1.f - c_erfc_erx_f, vl); \
- t = vfadd_vf_f32m##LMUL(q, 1.f + c_erfc_erx_f, vl); \
- r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r*/ \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 1.25f, vl), r, ret, vl); /* absx < 1.25f ? r : ret*/ \
- \
- r = vfrsub_vf_f32m##LMUL(vfmadd_vv_f32m##LMUL(x, q, vfsub_vf_f32m##LMUL(x, 0.5f, vl), vl), .5, vl); \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0.84375f, vl), r, ret, vl); /* absx < 0.84375f ? r : ret*/ \
- \
- ret = vfmerge_vfm_f32m##LMUL(vmflt_vf_f32m##LMUL##_b##MLEN(x, -6.0f, vl), ret, 2.f, vl); /* x< -6.0f ? 2.0f: ret*/ \
- \
- ret = vmerge_vvm_f32m##LMUL(vmfeq_vv_f32m##LMUL##_b##MLEN(x, x, vl), x, ret, vl); /* erfc(NaN) = NaN*/ \
- \
- return ret; \
-}
+ vfloat32m##LMUL##_t z = vreinterpret_v_u32m##LMUL##_f32m##LMUL(vand_vx_u32m##LMUL(vreinterpret_v_f32m##LMUL##_u32m##LMUL(absx), 0xfffff000, vl)); \
+ \
+ vfloat32m##LMUL##_t r = vfmul_vv_f32m##LMUL(exp_ps(vfmadd_vvf_f32m##LMUL(vfneg_v_f32m##LMUL(z, vl), z, -0.5625f, vl), vl), exp_ps(vfmadd_vv_f32m##LMUL(vfsub_vv_f32m##LMUL(z, absx, vl), vfadd_vv_f32m##LMUL(z, absx, vl), q, vl), vl), vl); \
+ r = vfdiv_vv_f32m##LMUL(r, absx, vl); \
+ t = vfrsub_vf_f32m##LMUL(r, 2.f, vl); \
+ r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r */ \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 28.f, vl), r, ret, vl); /* abs < 28.f ? r : ret */ \
+ \
+ r = vfrsub_vf_f32m##LMUL(q, 1.f - c_erfc_erx_f, vl); \
+ t = vfadd_vf_f32m##LMUL(q, 1.f + c_erfc_erx_f, vl); \
+ r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r*/ \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 1.25f, vl), r, ret, vl); /* absx < 1.25f ? r : ret*/ \
+ \
+ r = vfrsub_vf_f32m##LMUL(vfmadd_vv_f32m##LMUL(x, q, vfsub_vf_f32m##LMUL(x, 0.5f, vl), vl), .5, vl); \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0.84375f, vl), r, ret, vl); /* absx < 0.84375f ? r : ret*/ \
+ \
+ ret = vfmerge_vfm_f32m##LMUL(vmflt_vf_f32m##LMUL##_b##MLEN(x, -6.0f, vl), ret, 2.f, vl); /* x< -6.0f ? 2.0f: ret*/ \
+ \
+ ret = vmerge_vvm_f32m##LMUL(vmfeq_vv_f32m##LMUL##_b##MLEN(x, x, vl), x, ret, vl); /* erfc(NaN) = NaN*/ \
+ \
+ return ret; \
+ }
_RVV_FLOAT32_ERFC_OP(1, 32)
_RVV_FLOAT32_ERFC_OP(2, 16)
diff --git a/src/layer/riscv/selu_riscv.h b/src/layer/riscv/selu_riscv.h
index 2cd552fb9b8..185b7f5b2c8 100644
--- a/src/layer/riscv/selu_riscv.h
+++ b/src/layer/riscv/selu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_riscv : virtual public SELU
+class SELU_riscv : public SELU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h
index 2b4b33b7cbe..8f014e6c4f2 100644
--- a/src/layer/riscv/sigmoid_riscv.h
+++ b/src/layer/riscv/sigmoid_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_riscv : virtual public Sigmoid
+class Sigmoid_riscv : public Sigmoid
{
public:
Sigmoid_riscv();
diff --git a/src/layer/riscv/softmax_riscv.h b/src/layer/riscv/softmax_riscv.h
index bb39b5e3ba8..f93dc3022e1 100644
--- a/src/layer/riscv/softmax_riscv.h
+++ b/src/layer/riscv/softmax_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_riscv : virtual public Softmax
+class Softmax_riscv : public Softmax
{
public:
Softmax_riscv();
diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h
index 00de62fce4c..05d5cbe1cfd 100644
--- a/src/layer/riscv/swish_riscv.h
+++ b/src/layer/riscv/swish_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_riscv : virtual public Swish
+class Swish_riscv : public Swish
{
public:
Swish_riscv();
diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h
index c7038ef4f3e..6fb22ce91f3 100644
--- a/src/layer/riscv/tanh_riscv.h
+++ b/src/layer/riscv/tanh_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_riscv : virtual public TanH
+class TanH_riscv : public TanH
{
public:
TanH_riscv();
diff --git a/src/layer/riscv/unaryop_riscv.h b/src/layer/riscv/unaryop_riscv.h
index 7e4e4fa8bfe..215ad3426a4 100644
--- a/src/layer/riscv/unaryop_riscv.h
+++ b/src/layer/riscv/unaryop_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_riscv : virtual public UnaryOp
+class UnaryOp_riscv : public UnaryOp
{
public:
UnaryOp_riscv();
diff --git a/src/layer/split.cpp b/src/layer/split.cpp
index f79fce0f15c..996624dfe7a 100644
--- a/src/layer/split.cpp
+++ b/src/layer/split.cpp
@@ -21,11 +21,9 @@ Split::Split()
{
one_blob_only = false;
support_inplace = false;
- support_vulkan = true;
support_packing = true;
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh();
support_bf16_storage = true;
- support_image_storage = true;
}
int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const
@@ -39,28 +37,4 @@ int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
return 0;
}
-#if NCNN_VULKAN
-int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- const VkMat& bottom_blob = bottom_blobs[0];
- for (size_t i = 0; i < top_blobs.size(); i++)
- {
- top_blobs[i] = bottom_blob;
- }
-
- return 0;
-}
-
-int Split::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- const VkImageMat& bottom_blob = bottom_blobs[0];
- for (size_t i = 0; i < top_blobs.size(); i++)
- {
- top_blobs[i] = bottom_blob;
- }
-
- return 0;
-}
-#endif // NCNN_VULKAN
-
} // namespace ncnn
diff --git a/src/layer/split.h b/src/layer/split.h
index 7437866cfc5..53686f82be3 100644
--- a/src/layer/split.h
+++ b/src/layer/split.h
@@ -25,13 +25,6 @@ class Split : public Layer
Split();
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
-
-#if NCNN_VULKAN
- virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
- virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
-#endif // NCNN_VULKAN
-
-public:
};
} // namespace ncnn
diff --git a/src/layer/vulkan/absval_vulkan.h b/src/layer/vulkan/absval_vulkan.h
index d14c2ac5388..9652aac9b16 100644
--- a/src/layer/vulkan/absval_vulkan.h
+++ b/src/layer/vulkan/absval_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_vulkan : virtual public AbsVal
+class AbsVal_vulkan : public AbsVal
{
public:
AbsVal_vulkan();
diff --git a/src/layer/vulkan/batchnorm_vulkan.h b/src/layer/vulkan/batchnorm_vulkan.h
index 783b84b6efb..eedf049167d 100644
--- a/src/layer/vulkan/batchnorm_vulkan.h
+++ b/src/layer/vulkan/batchnorm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_vulkan : virtual public BatchNorm
+class BatchNorm_vulkan : public BatchNorm
{
public:
BatchNorm_vulkan();
diff --git a/src/layer/vulkan/binaryop_vulkan.h b/src/layer/vulkan/binaryop_vulkan.h
index 97ebcacc9f6..1c66186a0c3 100644
--- a/src/layer/vulkan/binaryop_vulkan.h
+++ b/src/layer/vulkan/binaryop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_vulkan : virtual public BinaryOp
+class BinaryOp_vulkan : public BinaryOp
{
public:
BinaryOp_vulkan();
diff --git a/src/layer/vulkan/cast_vulkan.h b/src/layer/vulkan/cast_vulkan.h
index c184c7439ac..47ce3b27920 100644
--- a/src/layer/vulkan/cast_vulkan.h
+++ b/src/layer/vulkan/cast_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_vulkan : virtual public Cast
+class Cast_vulkan : public Cast
{
public:
Cast_vulkan();
diff --git a/src/layer/vulkan/celu_vulkan.h b/src/layer/vulkan/celu_vulkan.h
index b5e25e19b4d..2c03a4b9c98 100644
--- a/src/layer/vulkan/celu_vulkan.h
+++ b/src/layer/vulkan/celu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class CELU_vulkan : virtual public CELU
+class CELU_vulkan : public CELU
{
public:
CELU_vulkan();
diff --git a/src/layer/vulkan/clip_vulkan.h b/src/layer/vulkan/clip_vulkan.h
index ea73eacd050..79e7745f0c4 100644
--- a/src/layer/vulkan/clip_vulkan.h
+++ b/src/layer/vulkan/clip_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_vulkan : virtual public Clip
+class Clip_vulkan : public Clip
{
public:
Clip_vulkan();
diff --git a/src/layer/vulkan/concat_vulkan.h b/src/layer/vulkan/concat_vulkan.h
index 3db05044ea9..109750f3d8d 100644
--- a/src/layer/vulkan/concat_vulkan.h
+++ b/src/layer/vulkan/concat_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_vulkan : virtual public Concat
+class Concat_vulkan : public Concat
{
public:
Concat_vulkan();
diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp
index 53dff49262b..2747012addc 100644
--- a/src/layer/vulkan/convolution1d_vulkan.cpp
+++ b/src/layer/vulkan/convolution1d_vulkan.cpp
@@ -29,15 +29,21 @@ Convolution1D_vulkan::Convolution1D_vulkan()
pipeline_convolution1d = 0;
}
-int Convolution1D_vulkan::create_pipeline(const Option& _opt)
+int Convolution1D_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Convolution1D::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Convolution1D_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const int maxk = kernel_w;
@@ -47,7 +53,7 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt)
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
ncnn::ParamDict pd;
@@ -127,6 +133,9 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt)
pipeline_convolution1d->create(shader_type_index, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
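
Across all the *_vulkan convolution families in this patch, the dynamic_weight bail-out moves from create_pipeline into a new load_param override, and create_pipeline now releases weight_data/bias_data once the pipelines are built. The ordering rationale (hedged): the framework decides whether a layer takes the Vulkan path after load_param returns but before create_pipeline runs, so support flags cleared inside create_pipeline came too late to affect that choice. A minimal self-contained model of the ordering, with hypothetical stand-in types:

```cpp
// Hypothetical stand-ins; real ncnn reads support_vulkan during Net setup.
struct LayerModel
{
    bool support_vulkan = true;
    bool dynamic_weight = false;

    int load_param(int dynamic) // stand-in for ParamDict parsing
    {
        dynamic_weight = (dynamic != 0);
        if (dynamic_weight)
            support_vulkan = false; // now final before path selection
        return 0;
    }
    int create_pipeline() { return 0; } // flipping flags here was too late
};

int main()
{
    LayerModel m;
    m.load_param(/*dynamic=*/1);
    bool use_gpu = m.support_vulkan; // the framework's decision point
    m.create_pipeline();
    return use_gpu ? 1 : 0;
}
```
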
diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h
index 4fb22040daa..28d692ae618 100644
--- a/src/layer/vulkan/convolution1d_vulkan.h
+++ b/src/layer/vulkan/convolution1d_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Convolution1D_vulkan : virtual public Convolution1D
+class Convolution1D_vulkan : public Convolution1D
{
public:
Convolution1D_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index beb0bccb9bf..302ab9085c5 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -43,15 +43,21 @@ Convolution_vulkan::Convolution_vulkan()
reshape_w = 0;
}
-int Convolution_vulkan::create_pipeline(const Option& _opt)
+int Convolution_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Convolution::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Convolution_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -117,7 +123,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
if (kernel_w == 1 && kernel_h == 1)
{
{
- reshape_1x1xw = ncnn::create_layer(ncnn::LayerType::Reshape);
+ reshape_1x1xw = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape);
reshape_1x1xw->vkdev = vkdev;
reshape_1x1xw->bottom_shapes.resize(1);
@@ -136,7 +142,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
}
{
- reshape_w = ncnn::create_layer(ncnn::LayerType::Reshape);
+ reshape_w = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape);
reshape_w->vkdev = vkdev;
reshape_w->bottom_shapes.resize(1);
@@ -157,7 +163,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
@@ -1142,6 +1148,9 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
pipeline_convolution->create(shader_type_index, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h
index 0efa76fec5c..fa4bdbc5350 100644
--- a/src/layer/vulkan/convolution_vulkan.h
+++ b/src/layer/vulkan/convolution_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Convolution_vulkan : virtual public Convolution
+class Convolution_vulkan : public Convolution
{
public:
Convolution_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
index 57069074c96..59eca6a55c6 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -41,15 +41,21 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
pipeline_convolutiondepthwise_group_pack8to1 = 0;
}
-int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
{
+ int ret = ConvolutionDepthWise::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -177,7 +183,7 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
@@ -265,6 +271,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -404,6 +413,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h
index 3689e369c2b..7a6cfe1f640 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class ConvolutionDepthWise_vulkan : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/crop_vulkan.h b/src/layer/vulkan/crop_vulkan.h
index e60b77f5e7c..4480268849a 100644
--- a/src/layer/vulkan/crop_vulkan.h
+++ b/src/layer/vulkan/crop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_vulkan : virtual public Crop
+class Crop_vulkan : public Crop
{
public:
Crop_vulkan();
diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp
index c53aedefc84..66e57db57bf 100644
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -33,15 +33,21 @@ Deconvolution_vulkan::Deconvolution_vulkan()
pipeline_deconvolution_col2im = 0;
}
-int Deconvolution_vulkan::create_pipeline(const Option& _opt)
+int Deconvolution_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Deconvolution::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Deconvolution_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -109,7 +115,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
}
{
- crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
crop->vkdev = vkdev;
crop->bottom_shapes.resize(1);
@@ -128,7 +134,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
}
{
- output_crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
output_crop->vkdev = vkdev;
output_crop->bottom_shapes.resize(1);
@@ -456,6 +462,9 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolution->set_optimal_local_size_xyz(local_size_xyz);
pipeline_deconvolution->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/deconvolution_vulkan.h b/src/layer/vulkan/deconvolution_vulkan.h
index 578bdc96747..6e18c38d681 100644
--- a/src/layer/vulkan/deconvolution_vulkan.h
+++ b/src/layer/vulkan/deconvolution_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Deconvolution_vulkan : virtual public Deconvolution
+class Deconvolution_vulkan : public Deconvolution
{
public:
Deconvolution_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
index b24418fa428..a715a4782f4 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -42,15 +42,21 @@ DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
}
-int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
{
+ int ret = DeconvolutionDepthWise::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -168,7 +174,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
crop->vkdev = vkdev;
crop->bottom_shapes.resize(1);
@@ -187,7 +193,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- output_crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
output_crop->vkdev = vkdev;
output_crop->bottom_shapes.resize(1);
@@ -289,6 +295,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -428,6 +437,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
index bf38f254eb5..5346de8e628 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class DeconvolutionDepthWise_vulkan : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_vulkan : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/deepcopy_vulkan.h b/src/layer/vulkan/deepcopy_vulkan.h
index a7a89d17a67..867ff1af454 100644
--- a/src/layer/vulkan/deepcopy_vulkan.h
+++ b/src/layer/vulkan/deepcopy_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeepCopy_vulkan : virtual public DeepCopy
+class DeepCopy_vulkan : public DeepCopy
{
public:
DeepCopy_vulkan();
diff --git a/src/layer/vulkan/dropout_vulkan.h b/src/layer/vulkan/dropout_vulkan.h
index da2e9ad6051..e45159b7659 100644
--- a/src/layer/vulkan/dropout_vulkan.h
+++ b/src/layer/vulkan/dropout_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_vulkan : virtual public Dropout
+class Dropout_vulkan : public Dropout
{
public:
Dropout_vulkan();
diff --git a/src/layer/vulkan/eltwise_vulkan.h b/src/layer/vulkan/eltwise_vulkan.h
index 2516db55dd2..09418657186 100644
--- a/src/layer/vulkan/eltwise_vulkan.h
+++ b/src/layer/vulkan/eltwise_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_vulkan : virtual public Eltwise
+class Eltwise_vulkan : public Eltwise
{
public:
Eltwise_vulkan();
diff --git a/src/layer/vulkan/elu_vulkan.h b/src/layer/vulkan/elu_vulkan.h
index 62da80a00c5..c616c3be1b9 100644
--- a/src/layer/vulkan/elu_vulkan.h
+++ b/src/layer/vulkan/elu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ELU_vulkan : virtual public ELU
+class ELU_vulkan : public ELU
{
public:
ELU_vulkan();
diff --git a/src/layer/vulkan/erf_vulkan.h b/src/layer/vulkan/erf_vulkan.h
index c793c558687..3f2ae5ace64 100644
--- a/src/layer/vulkan/erf_vulkan.h
+++ b/src/layer/vulkan/erf_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Erf_vulkan : virtual public Erf
+class Erf_vulkan : public Erf
{
public:
Erf_vulkan();
diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h
index 510cab1285f..1068ce547c3 100644
--- a/src/layer/vulkan/flatten_vulkan.h
+++ b/src/layer/vulkan/flatten_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_vulkan : virtual public Flatten
+class Flatten_vulkan : public Flatten
{
public:
Flatten_vulkan();
diff --git a/src/layer/vulkan/gelu_vulkan.h b/src/layer/vulkan/gelu_vulkan.h
index 2c04bc40ba1..ced6f07af4d 100644
--- a/src/layer/vulkan/gelu_vulkan.h
+++ b/src/layer/vulkan/gelu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_vulkan : virtual public GELU
+class GELU_vulkan : public GELU
{
public:
GELU_vulkan();
diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp
index ad768c63dd2..f30fa552f11 100644
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -100,6 +100,10 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
pipeline_gemm->create(LayerShaderType::gemm, opt, specializations);
}
+ A_data.release();
+ B_data.release();
+ C_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h
index 4edbc2f5472..d9fa92018e4 100644
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_vulkan : virtual public Gemm
+class Gemm_vulkan : public Gemm
{
public:
Gemm_vulkan();
diff --git a/src/layer/vulkan/hardsigmoid_vulkan.h b/src/layer/vulkan/hardsigmoid_vulkan.h
index 23ea48e2959..b0902948c7b 100644
--- a/src/layer/vulkan/hardsigmoid_vulkan.h
+++ b/src/layer/vulkan/hardsigmoid_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_vulkan : virtual public HardSigmoid
+class HardSigmoid_vulkan : public HardSigmoid
{
public:
HardSigmoid_vulkan();
diff --git a/src/layer/vulkan/hardswish_vulkan.h b/src/layer/vulkan/hardswish_vulkan.h
index cd5f93f1d76..ab4726877ef 100644
--- a/src/layer/vulkan/hardswish_vulkan.h
+++ b/src/layer/vulkan/hardswish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_vulkan : virtual public HardSwish
+class HardSwish_vulkan : public HardSwish
{
public:
HardSwish_vulkan();
diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp
index 06bf7b56943..ee73d4bb4ac 100644
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -154,6 +154,9 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -214,7 +217,7 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
}
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_vulkan(ncnn::LayerType::Flatten);
flatten->vkdev = vkdev;
flatten->bottom_shapes.resize(1);
@@ -361,9 +364,15 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h
index 4fe138d480f..9002c581c92 100644
--- a/src/layer/vulkan/innerproduct_vulkan.h
+++ b/src/layer/vulkan/innerproduct_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_vulkan : virtual public InnerProduct
+class InnerProduct_vulkan : public InnerProduct
{
public:
InnerProduct_vulkan();
diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h
index 6ff269d9fab..943fff65aee 100644
--- a/src/layer/vulkan/instancenorm_vulkan.h
+++ b/src/layer/vulkan/instancenorm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InstanceNorm_vulkan : virtual public InstanceNorm
+class InstanceNorm_vulkan : public InstanceNorm
{
public:
InstanceNorm_vulkan();
diff --git a/src/layer/vulkan/interp_vulkan.h b/src/layer/vulkan/interp_vulkan.h
index 94724a78689..5f1752341fe 100644
--- a/src/layer/vulkan/interp_vulkan.h
+++ b/src/layer/vulkan/interp_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_vulkan : virtual public Interp
+class Interp_vulkan : public Interp
{
public:
Interp_vulkan();
diff --git a/src/layer/vulkan/lrn_vulkan.h b/src/layer/vulkan/lrn_vulkan.h
index 30b3f0cee80..ad8cc99348d 100644
--- a/src/layer/vulkan/lrn_vulkan.h
+++ b/src/layer/vulkan/lrn_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_vulkan : virtual public LRN
+class LRN_vulkan : public LRN
{
public:
LRN_vulkan();
diff --git a/src/layer/vulkan/memorydata_vulkan.h b/src/layer/vulkan/memorydata_vulkan.h
index 7ba21283b75..32655abdcae 100644
--- a/src/layer/vulkan/memorydata_vulkan.h
+++ b/src/layer/vulkan/memorydata_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MemoryData_vulkan : virtual public MemoryData
+class MemoryData_vulkan : public MemoryData
{
public:
MemoryData_vulkan();
diff --git a/src/layer/vulkan/mish_vulkan.h b/src/layer/vulkan/mish_vulkan.h
index 762e331bfc6..864884382de 100644
--- a/src/layer/vulkan/mish_vulkan.h
+++ b/src/layer/vulkan/mish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_vulkan : virtual public Mish
+class Mish_vulkan : public Mish
{
public:
Mish_vulkan();
diff --git a/src/layer/vulkan/multiheadattention_vulkan.cpp b/src/layer/vulkan/multiheadattention_vulkan.cpp
index acb28869382..411b81b05e9 100644
--- a/src/layer/vulkan/multiheadattention_vulkan.cpp
+++ b/src/layer/vulkan/multiheadattention_vulkan.cpp
@@ -49,7 +49,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
{
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
q_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
@@ -72,10 +72,13 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = q_bias_data;
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
+
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
k_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -96,10 +99,13 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = k_bias_data;
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
+
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
v_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -120,6 +126,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = v_bias_data;
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
+
+ v_weight_data.release();
+ v_bias_data.release();
}
{
@@ -182,7 +191,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
}
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_vulkan(ncnn::LayerType::Softmax);
qk_softmax->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, -1);
@@ -193,7 +202,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
o_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -212,6 +221,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = out_bias_data;
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
+
+ out_weight_data.release();
+ out_bias_data.release();
}
return 0;
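
create_layer() resolves a backend through the registry, which no longer hands back an object that is simultaneously the CPU and Vulkan implementation; internal sub-layers of a Vulkan layer must therefore request the Vulkan factory explicitly, hence the switch to create_layer_vulkan() here. The q/k/v/softmax/o blocks above all follow the same construction sequence, sketched below with real ncnn calls (the wrapper function and variable names are illustrative):

#include "layer.h"       // ncnn::Layer, ncnn::create_layer_vulkan
#include "layer_type.h"  // ncnn::LayerType
#include "paramdict.h"   // ncnn::ParamDict
#include "modelbin.h"    // ncnn::ModelBinFromMatArray

// Sketch of one internal sub-layer, as built for q/k/v/o above.
static ncnn::Layer* make_internal_gemm(ncnn::VulkanDevice* vkdev, const ncnn::Mat* weights /* [2] */, const ncnn::Option& opt)
{
    ncnn::Layer* gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
    gemm->vkdev = vkdev;

    ncnn::ParamDict pd;
    pd.set(2, 0); // transA, as in the k/v branches above
    gemm->load_param(pd);

    gemm->load_model(ncnn::ModelBinFromMatArray(weights));
    gemm->create_pipeline(opt);
    return gemm;
}
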
diff --git a/src/layer/vulkan/multiheadattention_vulkan.h b/src/layer/vulkan/multiheadattention_vulkan.h
index 49662db47a2..3b77d96db48 100644
--- a/src/layer/vulkan/multiheadattention_vulkan.h
+++ b/src/layer/vulkan/multiheadattention_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_vulkan : virtual public MultiHeadAttention
+class MultiHeadAttention_vulkan : public MultiHeadAttention
{
public:
MultiHeadAttention_vulkan();
diff --git a/src/layer/vulkan/noop_vulkan.cpp b/src/layer/vulkan/noop_vulkan.cpp
new file mode 100644
index 00000000000..3a59d2613a3
--- /dev/null
+++ b/src/layer/vulkan/noop_vulkan.cpp
@@ -0,0 +1,35 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "noop_vulkan.h"
+
+namespace ncnn {
+
+Noop_vulkan::Noop_vulkan()
+{
+ support_vulkan = true;
+ support_image_storage = true;
+}
+
+int Noop_vulkan::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ return 0;
+}
+
+int Noop_vulkan::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/noop_vulkan.h b/src/layer/vulkan/noop_vulkan.h
new file mode 100644
index 00000000000..84d05d07a80
--- /dev/null
+++ b/src/layer/vulkan/noop_vulkan.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_NOOP_VULKAN_H
+#define LAYER_NOOP_VULKAN_H
+
+#include "noop.h"
+
+namespace ncnn {
+
+class Noop_vulkan : public Noop
+{
+public:
+ Noop_vulkan();
+
+ using Noop::forward;
+ virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+ virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_NOOP_VULKAN_H
diff --git a/src/layer/vulkan/normalize_vulkan.h b/src/layer/vulkan/normalize_vulkan.h
index ca44828df1a..4ad20cc457f 100644
--- a/src/layer/vulkan/normalize_vulkan.h
+++ b/src/layer/vulkan/normalize_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Normalize_vulkan : virtual public Normalize
+class Normalize_vulkan : public Normalize
{
public:
Normalize_vulkan();
diff --git a/src/layer/vulkan/packing_vulkan.h b/src/layer/vulkan/packing_vulkan.h
index 954698f98dd..fb9d1cd154f 100644
--- a/src/layer/vulkan/packing_vulkan.h
+++ b/src/layer/vulkan/packing_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_vulkan : virtual public Packing
+class Packing_vulkan : public Packing
{
public:
Packing_vulkan();
diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h
index faea7bd9266..bc6a235ea1c 100644
--- a/src/layer/vulkan/padding_vulkan.h
+++ b/src/layer/vulkan/padding_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_vulkan : virtual public Padding
+class Padding_vulkan : public Padding
{
public:
Padding_vulkan();
diff --git a/src/layer/vulkan/permute_vulkan.h b/src/layer/vulkan/permute_vulkan.h
index c9fc6cfdef1..fd073bec245 100644
--- a/src/layer/vulkan/permute_vulkan.h
+++ b/src/layer/vulkan/permute_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Permute_vulkan : virtual public Permute
+class Permute_vulkan : public Permute
{
public:
Permute_vulkan();
diff --git a/src/layer/vulkan/pixelshuffle_vulkan.h b/src/layer/vulkan/pixelshuffle_vulkan.h
index f24e2dd53b1..d0b812f2bb5 100644
--- a/src/layer/vulkan/pixelshuffle_vulkan.h
+++ b/src/layer/vulkan/pixelshuffle_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PixelShuffle_vulkan : virtual public PixelShuffle
+class PixelShuffle_vulkan : public PixelShuffle
{
public:
PixelShuffle_vulkan();
diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp
index eeba214ccac..ee7a9093301 100644
--- a/src/layer/vulkan/pooling_vulkan.cpp
+++ b/src/layer/vulkan/pooling_vulkan.cpp
@@ -128,7 +128,7 @@ int Pooling_vulkan::create_pipeline(const Option& _opt)
}
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h
index a3529b2708c..a336908d5d7 100644
--- a/src/layer/vulkan/pooling_vulkan.h
+++ b/src/layer/vulkan/pooling_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_vulkan : virtual public Pooling
+class Pooling_vulkan : public Pooling
{
public:
Pooling_vulkan();
diff --git a/src/layer/vulkan/prelu_vulkan.h b/src/layer/vulkan/prelu_vulkan.h
index a58f7ce00b3..d2bae5eaac6 100644
--- a/src/layer/vulkan/prelu_vulkan.h
+++ b/src/layer/vulkan/prelu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_vulkan : virtual public PReLU
+class PReLU_vulkan : public PReLU
{
public:
PReLU_vulkan();
diff --git a/src/layer/vulkan/priorbox_vulkan.h b/src/layer/vulkan/priorbox_vulkan.h
index 5b11387e0f5..394b12d0fa9 100644
--- a/src/layer/vulkan/priorbox_vulkan.h
+++ b/src/layer/vulkan/priorbox_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PriorBox_vulkan : virtual public PriorBox
+class PriorBox_vulkan : public PriorBox
{
public:
PriorBox_vulkan();
diff --git a/src/layer/vulkan/relu_vulkan.h b/src/layer/vulkan/relu_vulkan.h
index 7ac8fa76ae0..287781fdaa6 100644
--- a/src/layer/vulkan/relu_vulkan.h
+++ b/src/layer/vulkan/relu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_vulkan : virtual public ReLU
+class ReLU_vulkan : public ReLU
{
public:
ReLU_vulkan();
diff --git a/src/layer/vulkan/reorg_vulkan.h b/src/layer/vulkan/reorg_vulkan.h
index 1be2ade3601..f1565486996 100644
--- a/src/layer/vulkan/reorg_vulkan.h
+++ b/src/layer/vulkan/reorg_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reorg_vulkan : virtual public Reorg
+class Reorg_vulkan : public Reorg
{
public:
Reorg_vulkan();
diff --git a/src/layer/vulkan/reshape_vulkan.cpp b/src/layer/vulkan/reshape_vulkan.cpp
index 567acc6651d..e33efca47cc 100644
--- a/src/layer/vulkan/reshape_vulkan.cpp
+++ b/src/layer/vulkan/reshape_vulkan.cpp
@@ -121,7 +121,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
if (need_permute)
{
{
- permute_wh = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_wh = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_wh->vkdev = vkdev;
permute_wh->bottom_shapes.resize(1);
@@ -137,7 +137,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
permute_wh->create_pipeline(opt);
}
{
- permute_hwc = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_hwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_hwc->vkdev = vkdev;
permute_hwc->bottom_shapes.resize(1);
@@ -153,7 +153,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
permute_hwc->create_pipeline(opt);
}
{
- permute_dhwc = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_dhwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_dhwc->vkdev = vkdev;
permute_dhwc->bottom_shapes.resize(1);
@@ -171,7 +171,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
if (ndim == 2)
{
- permute_hw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_hw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_hw->vkdev = vkdev;
permute_hw->bottom_shapes.resize(1);
@@ -188,7 +188,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
}
if (ndim == 3)
{
- permute_chw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_chw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_chw->vkdev = vkdev;
permute_chw->bottom_shapes.resize(1);
@@ -205,7 +205,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
}
if (ndim == 4)
{
- permute_cdhw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_cdhw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_cdhw->vkdev = vkdev;
permute_cdhw->bottom_shapes.resize(1);
diff --git a/src/layer/vulkan/reshape_vulkan.h b/src/layer/vulkan/reshape_vulkan.h
index 134ae1b9ece..6b408f79940 100644
--- a/src/layer/vulkan/reshape_vulkan.h
+++ b/src/layer/vulkan/reshape_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_vulkan : virtual public Reshape
+class Reshape_vulkan : public Reshape
{
public:
Reshape_vulkan();
diff --git a/src/layer/vulkan/scale_vulkan.h b/src/layer/vulkan/scale_vulkan.h
index 867667e3da3..72851030d2d 100644
--- a/src/layer/vulkan/scale_vulkan.h
+++ b/src/layer/vulkan/scale_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_vulkan : virtual public Scale
+class Scale_vulkan : public Scale
{
public:
Scale_vulkan();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
index 79641acbc40..cf6361e981f 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
@@ -80,10 +80,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
if (bias_term == 1)
{
@@ -93,17 +93,24 @@ void main()
coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
+#endif
}
else
{
- sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
}
const int N = psc(c) / 4;
@@ -201,6 +208,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -210,6 +223,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
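
This shader change, and its siblings in the other three cooperative-matrix convolution shaders below, makes the accumulators fp16 when NCNN_fp16_arithmetic is set: the fp16 bias matrices can be assigned without a conversion, and the final fp32→fp16 convert before coopMatStore disappears, in exchange for accumulating the dot products in half precision. A host-side illustration of that tradeoff, using float and double purely as stand-ins for fp16 and fp32 (the shader types are float16_t and float):

#include <cstdio>

// Stand-in demo: summing many small terms in the narrow type rounds at
// every step; summing wide and converting once rounds only at the end.
int main()
{
    const int n = 1 << 20;
    float narrow_acc = 0.f; // stand-in for an fp16 accumulator
    double wide_acc = 0.0;  // stand-in for an fp32 accumulator
    for (int i = 0; i < n; i++)
    {
        narrow_acc += 1e-3f;
        wide_acc += 1e-3;
    }
    printf("narrow: %.6f  wide-then-convert: %.6f\n", narrow_acc, (float)wide_acc);
    return 0;
}
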
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
index 3c82d995202..6d9d9ce7b88 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
@@ -82,14 +82,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
if (bias_term == 1)
{
@@ -103,6 +103,16 @@ void main()
coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+ sum4 = bias2;
+ sum5 = bias2;
+ sum6 = bias3;
+ sum7 = bias3;
+#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
@@ -111,17 +121,18 @@ void main()
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
+#endif
}
else
{
- sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
}
const int N = psc(c) / 2;
@@ -247,6 +258,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -264,6 +285,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
index 2c0f57e708c..4ec7f1b3f42 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
@@ -80,10 +80,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
+#endif
if (bias_term == 1)
{
@@ -93,17 +100,31 @@ void main()
coopMatLoadNV(bias0, bias_data, gy, 0, false);
coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+#endif
}
else
{
+#if NCNN_fp16_arithmetic
+ sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
}
const int N = psc(c) / 4;
@@ -201,6 +222,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
@@ -210,6 +237,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
index 97322e6ed9e..e3580695f9d 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
@@ -82,6 +82,16 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
@@ -90,6 +100,7 @@ void main()
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
+#endif
if (bias_term == 1)
{
@@ -103,6 +114,16 @@ void main()
coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+ sum4 = bias2;
+ sum5 = bias2;
+ sum6 = bias3;
+ sum7 = bias3;
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
@@ -111,9 +132,20 @@ void main()
sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+#endif
}
else
{
+#if NCNN_fp16_arithmetic
+ sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
@@ -122,6 +154,7 @@ void main()
sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
}
const int N = psc(c) / 2;
@@ -247,6 +280,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+ coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+ coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+ coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+ coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
@@ -264,6 +307,7 @@ void main()
coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
index c4a494e917a..4482ed18e10 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
@@ -68,10 +68,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
const int N = psc(c) / 4;
@@ -168,6 +168,12 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -177,6 +183,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
index 785c917bbf4..ea5aa316b8c 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
@@ -70,14 +70,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
const int N = psc(c) / 2;
@@ -202,6 +202,16 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -219,6 +229,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
index bcca39eb615..1cf40a5917d 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
@@ -68,10 +68,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
const int N = psc(c) / 4;
@@ -168,6 +175,12 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
@@ -177,6 +190,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
index 35d3b4faba5..bcf46eea78c 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
@@ -70,6 +70,16 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
@@ -78,6 +88,7 @@ void main()
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
const int N = psc(c) / 2;
@@ -202,6 +213,16 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+ coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+ coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+ coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+ coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
@@ -219,6 +240,7 @@ void main()
coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shufflechannel_vulkan.h b/src/layer/vulkan/shufflechannel_vulkan.h
index 183e45ddaf7..1cbc706ba02 100644
--- a/src/layer/vulkan/shufflechannel_vulkan.h
+++ b/src/layer/vulkan/shufflechannel_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_vulkan : virtual public ShuffleChannel
+class ShuffleChannel_vulkan : public ShuffleChannel
{
public:
ShuffleChannel_vulkan();
diff --git a/src/layer/vulkan/sigmoid_vulkan.h b/src/layer/vulkan/sigmoid_vulkan.h
index 2d244506f4e..1350f6a47d4 100644
--- a/src/layer/vulkan/sigmoid_vulkan.h
+++ b/src/layer/vulkan/sigmoid_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_vulkan : virtual public Sigmoid
+class Sigmoid_vulkan : public Sigmoid
{
public:
Sigmoid_vulkan();
diff --git a/src/layer/vulkan/slice_vulkan.h b/src/layer/vulkan/slice_vulkan.h
index 53793752baa..92f9ad154b1 100644
--- a/src/layer/vulkan/slice_vulkan.h
+++ b/src/layer/vulkan/slice_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_vulkan : virtual public Slice
+class Slice_vulkan : public Slice
{
public:
Slice_vulkan();
diff --git a/src/layer/vulkan/softmax_vulkan.h b/src/layer/vulkan/softmax_vulkan.h
index 35478d2da24..aeff8d40be3 100644
--- a/src/layer/vulkan/softmax_vulkan.h
+++ b/src/layer/vulkan/softmax_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_vulkan : virtual public Softmax
+class Softmax_vulkan : public Softmax
{
public:
Softmax_vulkan();
diff --git a/src/layer/vulkan/split_vulkan.cpp b/src/layer/vulkan/split_vulkan.cpp
new file mode 100644
index 00000000000..791069cc7d7
--- /dev/null
+++ b/src/layer/vulkan/split_vulkan.cpp
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "split_vulkan.h"
+
+namespace ncnn {
+
+Split_vulkan::Split_vulkan()
+{
+ support_vulkan = true;
+ support_image_storage = true;
+}
+
+int Split_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ const VkMat& bottom_blob = bottom_blobs[0];
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ top_blobs[i] = bottom_blob;
+ }
+
+ return 0;
+}
+
+int Split_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ const VkImageMat& bottom_blob = bottom_blobs[0];
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ top_blobs[i] = bottom_blob;
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
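
Both new Split_vulkan::forward overloads are zero-copy: VkMat/VkImageMat assignment shares the underlying GPU buffer by reference count, so every output blob aliases the input and no command is recorded (hence the unused cmd parameter). The same idiom with host Mats, for reference:

#include "mat.h"
#include <vector>

// Sketch: reference-counted assignment makes a split O(1) per output.
void split_sketch(const ncnn::Mat& bottom_blob, std::vector<ncnn::Mat>& top_blobs)
{
    for (size_t i = 0; i < top_blobs.size(); i++)
    {
        top_blobs[i] = bottom_blob; // shares data and bumps the refcount; no copy
    }
}
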
diff --git a/src/layer/vulkan/split_vulkan.h b/src/layer/vulkan/split_vulkan.h
new file mode 100644
index 00000000000..8e1998a3a93
--- /dev/null
+++ b/src/layer/vulkan/split_vulkan.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SPLIT_VULKAN_H
+#define LAYER_SPLIT_VULKAN_H
+
+#include "split.h"
+
+namespace ncnn {
+
+class Split_vulkan : public Split
+{
+public:
+ Split_vulkan();
+
+ using Split::forward;
+ virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+ virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SPLIT_VULKAN_H
diff --git a/src/layer/vulkan/swish_vulkan.h b/src/layer/vulkan/swish_vulkan.h
index f8d7c9f7707..a562767cbba 100644
--- a/src/layer/vulkan/swish_vulkan.h
+++ b/src/layer/vulkan/swish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_vulkan : virtual public Swish
+class Swish_vulkan : public Swish
{
public:
Swish_vulkan();
diff --git a/src/layer/vulkan/tanh_vulkan.h b/src/layer/vulkan/tanh_vulkan.h
index cccb2701483..1926363a0f8 100644
--- a/src/layer/vulkan/tanh_vulkan.h
+++ b/src/layer/vulkan/tanh_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_vulkan : virtual public TanH
+class TanH_vulkan : public TanH
{
public:
TanH_vulkan();
diff --git a/src/layer/vulkan/unaryop_vulkan.h b/src/layer/vulkan/unaryop_vulkan.h
index c1d99873889..bad5377f9b3 100644
--- a/src/layer/vulkan/unaryop_vulkan.h
+++ b/src/layer/vulkan/unaryop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_vulkan : virtual public UnaryOp
+class UnaryOp_vulkan : public UnaryOp
{
public:
UnaryOp_vulkan();
diff --git a/src/layer/x86/batchnorm_x86.h b/src/layer/x86/batchnorm_x86.h
index b991e313c3e..7168332a1b3 100644
--- a/src/layer/x86/batchnorm_x86.h
+++ b/src/layer/x86/batchnorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_x86 : virtual public BatchNorm
+class BatchNorm_x86 : public BatchNorm
{
public:
BatchNorm_x86();
diff --git a/src/layer/x86/bias_x86.h b/src/layer/x86/bias_x86.h
index 39d1bcef492..ab8e30de56d 100644
--- a/src/layer/x86/bias_x86.h
+++ b/src/layer/x86/bias_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_x86 : virtual public Bias
+class Bias_x86 : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/binaryop_x86.h b/src/layer/x86/binaryop_x86.h
index 9f3ebb3cac9..cd3ff12a989 100644
--- a/src/layer/x86/binaryop_x86.h
+++ b/src/layer/x86/binaryop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_x86 : virtual public BinaryOp
+class BinaryOp_x86 : public BinaryOp
{
public:
BinaryOp_x86();
diff --git a/src/layer/x86/bnll_x86.h b/src/layer/x86/bnll_x86.h
index ac7536b75bf..b3fad45ca7d 100644
--- a/src/layer/x86/bnll_x86.h
+++ b/src/layer/x86/bnll_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BNLL_x86 : virtual public BNLL
+class BNLL_x86 : public BNLL
{
public:
BNLL_x86();
diff --git a/src/layer/x86/cast_x86.h b/src/layer/x86/cast_x86.h
index bd1ec503382..45b27a8c6ce 100644
--- a/src/layer/x86/cast_x86.h
+++ b/src/layer/x86/cast_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_x86 : virtual public Cast
+class Cast_x86 : public Cast
{
public:
Cast_x86();
diff --git a/src/layer/x86/clip_x86.h b/src/layer/x86/clip_x86.h
index be026777f08..45a4058e90e 100644
--- a/src/layer/x86/clip_x86.h
+++ b/src/layer/x86/clip_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_x86 : virtual public Clip
+class Clip_x86 : public Clip
{
public:
Clip_x86();
diff --git a/src/layer/x86/concat_x86.h b/src/layer/x86/concat_x86.h
index 054d4b784d9..28ff162dbdc 100644
--- a/src/layer/x86/concat_x86.h
+++ b/src/layer/x86/concat_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_x86 : virtual public Concat
+class Concat_x86 : public Concat
{
public:
Concat_x86();
diff --git a/src/layer/x86/convolution1d_x86.cpp b/src/layer/x86/convolution1d_x86.cpp
index e7df16b8316..905db18b728 100644
--- a/src/layer/x86/convolution1d_x86.cpp
+++ b/src/layer/x86/convolution1d_x86.cpp
@@ -43,6 +43,8 @@ int Convolution1D_x86::create_pipeline(const Option& /*opt*/)
convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
@@ -126,7 +128,7 @@ int Convolution1D_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
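
The dynamic-weight path above builds a temporary Convolution1D op that consumes host Mats, so with backend-specific factories it must request the CPU implementation explicitly via create_layer_cpu(). The lifecycle of such a throwaway op, sketched with the param and model loading elided to comments (the wrapper function name is illustrative):

#include "layer.h"
#include "layer_type.h"

// Sketch: a throwaway CPU op for the dynamic-weight path.
static int forward_dynamic_sketch(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt)
{
    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
    // ... pd.set(...), op->load_param(pd), op->load_model(...) as in the diff ...
    op->create_pipeline(opt);
    int ret = op->forward(bottom_blob, top_blob, opt);
    op->destroy_pipeline(opt);
    delete op;
    return ret;
}
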
diff --git a/src/layer/x86/convolution1d_x86.h b/src/layer/x86/convolution1d_x86.h
index ec1782b7063..497b34e5962 100644
--- a/src/layer/x86/convolution1d_x86.h
+++ b/src/layer/x86/convolution1d_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_x86 : virtual public Convolution1D
+class Convolution1D_x86 : public Convolution1D
{
public:
Convolution1D_x86();
diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h
index 8c7b891b0dd..94ea79d4540 100644
--- a/src/layer/x86/convolution_3x3_winograd_int8.h
+++ b/src/layer/x86/convolution_3x3_winograd_int8.h
@@ -3544,10 +3544,10 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
- if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
- if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
- if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
+ _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
+ if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
+ if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
+ if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
}
}
@@ -3653,28 +3653,28 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
- if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
- if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
- if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
+ _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
+ if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
+ if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
+ if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 2 + 1 < w)
{
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 2 + 2 < w)
{
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 2 + 3 < w)
{
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
#endif // __AVX512F__
@@ -4768,12 +4768,12 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
- if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
- if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
- if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
- if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), sizeof(signed char))));
- if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), sizeof(signed char))));
+ _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
+ if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
+ if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
+ if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
+ if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), 1)));
+ if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), 1)));
}
}
@@ -4919,40 +4919,40 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
- if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
- if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
- if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
- if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char))));
- if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char))));
+ _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
+ if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
+ if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
+ if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
+ if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1)));
+ if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 4 + 1 < w)
{
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 4 + 2 < w)
{
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 4 + 3 < w)
{
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
if (tj * 4 + 4 < w)
{
- __m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1), _sindex88);
_r4 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val4_32, 0), _mm256_extracti128_si256(_val4_32, 1)));
}
if (tj * 4 + 5 < w)
{
- __m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1), _sindex88);
_r5 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val5_32, 0), _mm256_extracti128_si256(_val5_32, 1)));
}
#endif // __AVX512F__
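
The `sizeof(signed char)` → `1` swap in all of these gather calls is behavior-preserving: the last argument of the gather intrinsics is a byte scale applied to each index and must be 1, 2, 4, or 8, and sizeof(signed char) is 1 by definition; the literal simply satisfies compilers that want a plain immediate there, which is the presumed motivation. Note that the AVX2 and AVX-512 forms take base pointer and index vector in opposite orders, exactly as used above. A self-contained example of the AVX2 form:

#include <immintrin.h>

// Gather eight 32-bit loads from byte addresses base + i*N (AVX2 form:
// base pointer first, index vector second, byte scale last).
__m256i gather_strided(const signed char* base, int N)
{
    __m256i vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    vindex = _mm256_mullo_epi32(vindex, _mm256_set1_epi32(N));
    return _mm256_i32gather_epi32((const int*)base, vindex, 1); // scale = 1 byte
}
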
diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h
index ef8fe0ab920..e72dd8882dd 100644
--- a/src/layer/x86/convolution_im2col_gemm_int8.h
+++ b/src/layer/x86/convolution_im2col_gemm_int8.h
@@ -4871,7 +4871,6 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
_sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp1);
_sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp1);
- _sum0 = _sum0;
_sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD);
// 0123 4567 89ab cdef x 0
diff --git a/src/layer/x86/convolution_packed_int8.h b/src/layer/x86/convolution_packed_int8.h
index 6217f8bf5bd..46c03f0ca9b 100644
--- a/src/layer/x86/convolution_packed_int8.h
+++ b/src/layer/x86/convolution_packed_int8.h
@@ -169,22 +169,22 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
- __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), sizeof(signed char)));
- __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), sizeof(signed char)));
- __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), sizeof(signed char)));
- __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), sizeof(signed char)));
- __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), sizeof(signed char)));
- __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), sizeof(signed char)));
- __m128i _w8 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), sizeof(signed char)));
- __m128i _w9 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr9 + k), sizeof(signed char)));
- __m128i _wa = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptra + k), sizeof(signed char)));
- __m128i _wb = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrb + k), sizeof(signed char)));
- __m128i _wc = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrc + k), sizeof(signed char)));
- __m128i _wd = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrd + k), sizeof(signed char)));
- __m128i _we = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptre + k), sizeof(signed char)));
- __m128i _wf = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrf + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
+ __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), 1));
+ __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), 1));
+ __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), 1));
+ __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), 1));
+ __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), 1));
+ __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), 1));
+ __m128i _w8 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), 1));
+ __m128i _w9 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr9 + k), 1));
+ __m128i _wa = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptra + k), 1));
+ __m128i _wb = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrb + k), 1));
+ __m128i _wc = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrc + k), 1));
+ __m128i _wd = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrd + k), 1));
+ __m128i _we = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptre + k), 1));
+ __m128i _wf = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrf + k), 1));
transpose8x16_epi16(_w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7, _w8, _w9, _wa, _wb, _wc, _wd, _we, _wf);
@@ -231,22 +231,22 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
- __m128i _w1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex, sizeof(signed char)));
- __m128i _w2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr2 + k), _vindex, sizeof(signed char)));
- __m128i _w3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr3 + k), _vindex, sizeof(signed char)));
- __m128i _w4 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex, sizeof(signed char)));
- __m128i _w5 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr5 + k), _vindex, sizeof(signed char)));
- __m128i _w6 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr6 + k), _vindex, sizeof(signed char)));
- __m128i _w7 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr7 + k), _vindex, sizeof(signed char)));
- __m128i _w8 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr8 + k), _vindex, sizeof(signed char)));
- __m128i _w9 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr9 + k), _vindex, sizeof(signed char)));
- __m128i _wa = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptra + k), _vindex, sizeof(signed char)));
- __m128i _wb = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrb + k), _vindex, sizeof(signed char)));
- __m128i _wc = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrc + k), _vindex, sizeof(signed char)));
- __m128i _wd = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrd + k), _vindex, sizeof(signed char)));
- __m128i _we = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptre + k), _vindex, sizeof(signed char)));
- __m128i _wf = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrf + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
+ __m128i _w1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex, 1));
+ __m128i _w2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr2 + k), _vindex, 1));
+ __m128i _w3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr3 + k), _vindex, 1));
+ __m128i _w4 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex, 1));
+ __m128i _w5 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr5 + k), _vindex, 1));
+ __m128i _w6 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr6 + k), _vindex, 1));
+ __m128i _w7 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr7 + k), _vindex, 1));
+ __m128i _w8 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr8 + k), _vindex, 1));
+ __m128i _w9 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr9 + k), _vindex, 1));
+ __m128i _wa = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptra + k), _vindex, 1));
+ __m128i _wb = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrb + k), _vindex, 1));
+ __m128i _wc = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrc + k), _vindex, 1));
+ __m128i _wd = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrd + k), _vindex, 1));
+ __m128i _we = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptre + k), _vindex, 1));
+ __m128i _wf = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrf + k), _vindex, 1));
__m128i _w08 = _mm_unpacklo_epi64(_w0, _w8);
__m128i _w19 = _mm_unpacklo_epi64(_w1, _w9);
@@ -296,8 +296,8 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
__m512i _vindex = _mm512_inserti64x4(_mm512_castsi256_si512(_vindex01), _vindex23, 1);
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
_mm_storeu_si128((__m128i*)(g00 + 16), _w1);
@@ -313,7 +313,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(inch * maxk));
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
g00 += 16;
}
@@ -346,14 +346,14 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
- __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), sizeof(signed char)));
- __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), sizeof(signed char)));
- __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), sizeof(signed char)));
- __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), sizeof(signed char)));
- __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), sizeof(signed char)));
- __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
+ __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), 1));
+ __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), 1));
+ __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), 1));
+ __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), 1));
+ __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), 1));
+ __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), 1));
transpose8x8_epi16(_w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7);
@@ -446,10 +446,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
#else
- __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)), _sindex88);
- __m256i _w23 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex01, sizeof(signed char)), _sindex88);
+ __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1), _sindex88);
+ __m256i _w23 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex01, 1), _sindex88);
__m128i _w01xx = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w01, 0), _mm256_extracti128_si256(_w01, 1));
__m128i _w23xx = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w23, 0), _mm256_extracti128_si256(_w23, 1));
__m128i _w0 = _mm_unpacklo_epi64(_w01xx, _w23xx);
@@ -471,7 +471,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
#endif
for (int k = 0; k < maxk; k++)
{
- __m256i _w32 = _mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char));
+ __m256i _w32 = _mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1);
#if __AVX512F__
__m128i _w0 = _mm256_cvtepi32_epi8(_w32);
#else
@@ -583,10 +583,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
#elif __AVX2__
- __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)), _sindex88);
+ __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1), _sindex88);
__m128i _w0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w01, 0), _mm256_extracti128_si256(_w01, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
#else
@@ -624,10 +624,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#elif __AVX2__
- __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)), _sindex8);
+ __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1), _sindex8);
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#else
const signed char* k0 = kptr0 + k;
@@ -670,8 +670,8 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
_mm_storeu_si128((__m128i*)(g00 + 16), _w1);
@@ -699,11 +699,11 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
#elif __AVX2__
- __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex0, sizeof(signed char)), _sindex88);
- __m256i _w11 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex0, sizeof(signed char)), _sindex88);
+ __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex0, 1), _sindex88);
+ __m256i _w11 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex0, 1), _sindex88);
__m128i _w0x = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w00, 0), _mm256_extracti128_si256(_w00, 1));
__m128i _w1x = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w11, 0), _mm256_extracti128_si256(_w11, 1));
__m128i _w0 = _mm_unpacklo_epi64(_w0x, _w1x);
@@ -748,10 +748,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#elif __AVX2__
- __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)), _sindex8);
+ __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1), _sindex8);
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#else
const signed char* k0 = kptr0 + k;
@@ -805,7 +805,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
g00 += 16;
@@ -827,12 +827,12 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
g00 += 8;
#elif __AVX2__
- __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, 1), _sindex88);
__m128i _w0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w00, 0), _mm256_extracti128_si256(_w00, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
@@ -1029,10 +1029,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1163,10 +1163,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1394,8 +1394,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1480,8 +1480,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1649,7 +1649,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1711,7 +1711,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1910,10 +1910,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2028,17 +2028,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -2317,8 +2317,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2399,13 +2399,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -2601,7 +2601,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2666,11 +2666,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -2882,10 +2882,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2997,17 +2997,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -3337,8 +3337,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -3422,13 +3422,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -3669,7 +3669,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -3733,11 +3733,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -3986,10 +3986,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4077,17 +4077,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -4324,8 +4324,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4390,13 +4390,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -4562,7 +4562,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _val = _mm256_cvtepi8_epi16(_r0);
@@ -4612,11 +4612,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -4790,10 +4790,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4861,18 +4861,18 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -5071,8 +5071,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -5124,14 +5124,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif
@@ -5264,7 +5264,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _val = _mm256_cvtepi8_epi16(_r0);
@@ -5306,11 +5306,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
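// Why every `sizeof(signed char)` above became a literal `1` (a sketch of the
// reasoning; the motive is our assumption, not stated in the patch): the last
// argument of the x86 gather intrinsics is a byte scale that must be a
// compile-time constant 1, 2, 4 or 8. sizeof(signed char) is guaranteed to be
// 1, so codegen is identical; the literal simply makes the byte granularity
// explicit. Note the argument-order quirk the hunks preserve: AVX-512 takes
// (vindex, base, scale) while AVX2 takes (base, vindex, scale).
#include <immintrin.h>

#if defined(__AVX512F__)
// Illustrative helper (the name is ours, not ncnn's): gather 16 int8 values
// laid out N bytes apart, mirroring the _r0.._r3 loads above. Each lane reads
// a 32-bit word at base + i*N*1; the narrowing cvt keeps the low byte per lane.
static inline __m128i gather16_int8_strided(const signed char* base, int N)
{
    __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
    return _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)base, 1));
}
#endif // __AVX512F__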
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 6e828ff0d21..c1f354ea6de 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -297,7 +297,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
{
- convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
+ convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -334,10 +334,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
convolution_dilation1->create_pipeline(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -454,10 +451,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -469,7 +463,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -548,10 +542,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1182,7 +1173,7 @@ int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ ... @@ int ConvolutionDepthWise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -849,16 +843,15 @@ int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
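// The two recurring edits in the .cpp hunks above and below (motives are
// inferred, not stated in the patch):
//  - ncnn::create_layer() -> ncnn::create_layer_cpu(): helper ops built
//    inside create_pipeline()/forward() (the grouped and dynamic-weight
//    fallbacks) should always run the CPU implementation, so they are created
//    through the CPU-only factory instead of the generic one, which can hand
//    back a GPU-capable layer when Vulkan is enabled.
//  - `if (opt.lightmode) { weight_data.release(); }` -> unconditional
//    weight_data.release(): once the transformed kernel (weight_data_tm or
//    the per-group ops) has been built, the raw weights are no longer needed,
//    so they are freed regardless of lightmode.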
diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h
index 6fe066e5bed..1fedb119bd3 100644
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_x86 : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_x86();
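// Sketch of why `virtual public` can become plain `public` in all these
// per-arch headers (our reading; the patch itself gives no rationale).
// Virtual inheritance was only needed while a generated combined class
// reached the same base along two paths, roughly:
//
//   class ConvolutionDepthWise_x86    : virtual public ConvolutionDepthWise { };
//   class ConvolutionDepthWise_vulkan : virtual public ConvolutionDepthWise { };
//   class ConvolutionDepthWise_final  : public ConvolutionDepthWise_x86,
//                                       public ConvolutionDepthWise_vulkan { };
//
// With CPU and GPU layer creation split (see create_layer_cpu above), each
// arch-specific class has a single inheritance path, and dropping the virtual
// base removes its extra indirection.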
diff --git a/src/layer/x86/crop_x86.h b/src/layer/x86/crop_x86.h
index e7e3d140fc5..ba0fc1b607e 100644
--- a/src/layer/x86/crop_x86.h
+++ b/src/layer/x86/crop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_x86 : virtual public Crop
+class Crop_x86 : public Crop
{
public:
Crop_x86();
diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp
index 46bdca2a397..6a94104a43d 100644
--- a/src/layer/x86/deconvolution_x86.cpp
+++ b/src/layer/x86/deconvolution_x86.cpp
@@ -94,7 +94,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -193,10 +193,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -694,7 +691,7 @@ int Deconvolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/x86/deconvolution_x86.h b/src/layer/x86/deconvolution_x86.h
index 4951870bcd0..66c23eef3f3 100644
--- a/src/layer/x86/deconvolution_x86.h
+++ b/src/layer/x86/deconvolution_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_x86 : virtual public Deconvolution
+class Deconvolution_x86 : public Deconvolution
{
public:
Deconvolution_x86();
diff --git a/src/layer/x86/deconvolutiondepthwise_x86.cpp b/src/layer/x86/deconvolutiondepthwise_x86.cpp
index 43a573a64ef..4a1e89d26a8 100644
--- a/src/layer/x86/deconvolutiondepthwise_x86.cpp
+++ b/src/layer/x86/deconvolutiondepthwise_x86.cpp
@@ -109,16 +109,15 @@ int DeconvolutionDepthWise_x86::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -146,7 +145,7 @@ int DeconvolutionDepthWise_x86::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -641,7 +640,7 @@ int DeconvolutionDepthWise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/x86/deconvolutiondepthwise_x86.h b/src/layer/x86/deconvolutiondepthwise_x86.h
index 07fb5e54f9b..9c9e54cccf4 100644
--- a/src/layer/x86/deconvolutiondepthwise_x86.h
+++ b/src/layer/x86/deconvolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_x86 : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_x86 : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_x86();
diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp
index 076e56f7e64..8fc7bdf2855 100644
--- a/src/layer/x86/deformableconv2d_x86.cpp
+++ b/src/layer/x86/deformableconv2d_x86.cpp
@@ -134,7 +134,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -203,10 +203,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt)
deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h
index e5ab4e08c99..66cce21ab49 100644
--- a/src/layer/x86/deformableconv2d_x86.h
+++ b/src/layer/x86/deformableconv2d_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeformableConv2D_x86 : virtual public DeformableConv2D
+class DeformableConv2D_x86 : public DeformableConv2D
{
public:
DeformableConv2D_x86();
diff --git a/src/layer/x86/dequantize_x86.h b/src/layer/x86/dequantize_x86.h
index 2d8a6a22b0a..52bfcaed22e 100644
--- a/src/layer/x86/dequantize_x86.h
+++ b/src/layer/x86/dequantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_x86 : virtual public Dequantize
+class Dequantize_x86 : public Dequantize
{
public:
Dequantize_x86();
diff --git a/src/layer/x86/dropout_x86.h b/src/layer/x86/dropout_x86.h
index 959c9889e34..d44a8987162 100644
--- a/src/layer/x86/dropout_x86.h
+++ b/src/layer/x86/dropout_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_x86 : virtual public Dropout
+class Dropout_x86 : public Dropout
{
public:
Dropout_x86();
diff --git a/src/layer/x86/eltwise_x86.h b/src/layer/x86/eltwise_x86.h
index 0f4eac064e0..e941817a303 100644
--- a/src/layer/x86/eltwise_x86.h
+++ b/src/layer/x86/eltwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_x86 : virtual public Eltwise
+class Eltwise_x86 : public Eltwise
{
public:
Eltwise_x86();
diff --git a/src/layer/x86/elu_x86.h b/src/layer/x86/elu_x86.h
index cd49c4f7d5a..6da00490d21 100644
--- a/src/layer/x86/elu_x86.h
+++ b/src/layer/x86/elu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ELU_x86 : virtual public ELU
+class ELU_x86 : public ELU
{
public:
ELU_x86();
diff --git a/src/layer/x86/flatten_x86.h b/src/layer/x86/flatten_x86.h
index fcd512ae194..29820121695 100644
--- a/src/layer/x86/flatten_x86.h
+++ b/src/layer/x86/flatten_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_x86 : virtual public Flatten
+class Flatten_x86 : public Flatten
{
public:
Flatten_x86();
diff --git a/src/layer/x86/gelu_x86.h b/src/layer/x86/gelu_x86.h
index 75d821bfd45..ba4b43e65ec 100644
--- a/src/layer/x86/gelu_x86.h
+++ b/src/layer/x86/gelu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_x86 : virtual public GELU
+class GELU_x86 : public GELU
{
public:
GELU_x86();
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
index 19cd7ebc09a..4ab37836a43 100644
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -7235,10 +7235,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -7282,10 +7279,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -7321,10 +7315,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/x86/gemm_x86.h b/src/layer/x86/gemm_x86.h
index ef14872d76e..6f8eb4a82bf 100644
--- a/src/layer/x86/gemm_x86.h
+++ b/src/layer/x86/gemm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_x86 : virtual public Gemm
+class Gemm_x86 : public Gemm
{
public:
Gemm_x86();
diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h
index 826414eefc9..caf7c7c50c3 100644
--- a/src/layer/x86/gridsample_x86.h
+++ b/src/layer/x86/gridsample_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GridSample_x86 : virtual public GridSample
+class GridSample_x86 : public GridSample
{
public:
GridSample_x86();
diff --git a/src/layer/x86/groupnorm_x86.h b/src/layer/x86/groupnorm_x86.h
index c3085e3622e..151884e5455 100644
--- a/src/layer/x86/groupnorm_x86.h
+++ b/src/layer/x86/groupnorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GroupNorm_x86 : virtual public GroupNorm
+class GroupNorm_x86 : public GroupNorm
{
public:
GroupNorm_x86();
diff --git a/src/layer/x86/hardsigmoid_x86.h b/src/layer/x86/hardsigmoid_x86.h
index b111608bb87..418a8dc941f 100644
--- a/src/layer/x86/hardsigmoid_x86.h
+++ b/src/layer/x86/hardsigmoid_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_x86 : virtual public HardSigmoid
+class HardSigmoid_x86 : public HardSigmoid
{
public:
HardSigmoid_x86();
diff --git a/src/layer/x86/hardswish_x86.h b/src/layer/x86/hardswish_x86.h
index 37fd42a513c..4fe521ea47d 100644
--- a/src/layer/x86/hardswish_x86.h
+++ b/src/layer/x86/hardswish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_x86 : virtual public HardSwish
+class HardSwish_x86 : public HardSwish
{
public:
HardSwish_x86();
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
index 67bf0cca548..dee07d1de64 100644
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -53,7 +53,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt)
{
// if (opt.use_packing_layout)
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -80,10 +80,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt)
innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -178,10 +175,7 @@ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt)
innerproduct_transform_kernel_fp16s_sse(weight_data, weight_data_tm, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -287,10 +281,7 @@ int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h
index 211131e6132..19da245f32f 100644
--- a/src/layer/x86/innerproduct_x86.h
+++ b/src/layer/x86/innerproduct_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_x86 : virtual public InnerProduct
+class InnerProduct_x86 : public InnerProduct
{
public:
InnerProduct_x86();
diff --git a/src/layer/x86/interp_x86.h b/src/layer/x86/interp_x86.h
index 6f91b950ef5..46fcde6f221 100644
--- a/src/layer/x86/interp_x86.h
+++ b/src/layer/x86/interp_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_x86 : virtual public Interp
+class Interp_x86 : public Interp
{
public:
Interp_x86();
diff --git a/src/layer/x86/layernorm_x86.h b/src/layer/x86/layernorm_x86.h
index 42eb551ed95..7e8ec05894c 100644
--- a/src/layer/x86/layernorm_x86.h
+++ b/src/layer/x86/layernorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LayerNorm_x86 : virtual public LayerNorm
+class LayerNorm_x86 : public LayerNorm
{
public:
LayerNorm_x86();
diff --git a/src/layer/x86/lrn_x86.h b/src/layer/x86/lrn_x86.h
index 3fe791872c6..9aa85367cda 100644
--- a/src/layer/x86/lrn_x86.h
+++ b/src/layer/x86/lrn_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_x86 : virtual public LRN
+class LRN_x86 : public LRN
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp
index 6ba218e53d3..5d693648f44 100644
--- a/src/layer/x86/lstm_x86.cpp
+++ b/src/layer/x86/lstm_x86.cpp
@@ -182,12 +182,9 @@ int LSTM_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/x86/lstm_x86.h b/src/layer/x86/lstm_x86.h
index cab7d7e32fa..1dc56d45e03 100644
--- a/src/layer/x86/lstm_x86.h
+++ b/src/layer/x86/lstm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LSTM_x86 : virtual public LSTM
+class LSTM_x86 : public LSTM
{
public:
LSTM_x86();
diff --git a/src/layer/x86/matmul_x86.cpp b/src/layer/x86/matmul_x86.cpp
index 2c829ea1848..d0afe81f76b 100644
--- a/src/layer/x86/matmul_x86.cpp
+++ b/src/layer/x86/matmul_x86.cpp
@@ -25,7 +25,7 @@ MatMul_x86::MatMul_x86()
int MatMul_x86::create_pipeline(const Option& opt)
{
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
diff --git a/src/layer/x86/matmul_x86.h b/src/layer/x86/matmul_x86.h
index 12311e7a94d..afbb85a7883 100644
--- a/src/layer/x86/matmul_x86.h
+++ b/src/layer/x86/matmul_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MatMul_x86 : virtual public MatMul
+class MatMul_x86 : public MatMul
{
public:
MatMul_x86();
diff --git a/src/layer/x86/mish_x86.cpp b/src/layer/x86/mish_x86.cpp
index e55a5e1f808..90ce135c19a 100644
--- a/src/layer/x86/mish_x86.cpp
+++ b/src/layer/x86/mish_x86.cpp
@@ -31,64 +31,8 @@ int Mish_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
- int size = w * h * d;
-#if __SSE2__
int elempack = bottom_top_blob.elempack;
-
-#if __AVX__
-#if __AVX512F__
- if (elempack == 16)
- {
- Mat tmp;
- convert_packing(bottom_top_blob, tmp, 8, opt);
-
- forward_inplace(tmp, opt);
-
- convert_packing(tmp, bottom_top_blob, 16, opt);
-
- return 0;
- }
-#endif // __AVX512F__
-
- if (elempack == 8)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q = 0; q < channels; q++)
- {
- float* ptr = bottom_top_blob.channel(q);
-
- for (int i = 0; i < size; i++)
- {
- __m256 _p = _mm256_loadu_ps(ptr);
- _p = mish_avx(_p);
- _mm256_storeu_ps(ptr, _p);
- ptr += 8;
- }
- }
-
- return 0;
- }
-#endif // __AVX__
-
- if (elempack == 4)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q = 0; q < channels; q++)
- {
- float* ptr = bottom_top_blob.channel(q);
-
- for (int i = 0; i < size; i++)
- {
- __m128 _p = _mm_loadu_ps(ptr);
- _p = mish_sse(_p);
- _mm_storeu_ps(ptr, _p);
- ptr += 4;
- }
- }
-
- return 0;
- }
-#endif // __SSE2__
+ int size = w * h * d * elempack;
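+ // packed layouts keep each channel's data contiguous, so one flat loop over
+ // w*h*d*elempack covers elempack 1/4/8/16 without per-packing dispatch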
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -98,6 +42,15 @@ int Mish_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int i = 0;
#if __SSE2__
#if __AVX__
+#if __AVX512F__
+ for (; i + 15 < size; i += 16)
+ {
+ __m512 _p = _mm512_loadu_ps(ptr);
+ _p = mish_avx512(_p);
+ _mm512_storeu_ps(ptr, _p);
+ ptr += 16;
+ }
+#endif
for (; i + 7 < size; i += 8)
{
__m256 _p = _mm256_loadu_ps(ptr);
diff --git a/src/layer/x86/mish_x86.h b/src/layer/x86/mish_x86.h
index fe625e2ca37..dce8823c6f5 100644
--- a/src/layer/x86/mish_x86.h
+++ b/src/layer/x86/mish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_x86 : virtual public Mish
+class Mish_x86 : public Mish
{
public:
Mish_x86();
diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp
index 98397437c9d..2bddad5582d 100644
--- a/src/layer/x86/multiheadattention_x86.cpp
+++ b/src/layer/x86/multiheadattention_x86.cpp
@@ -42,7 +42,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
const int embed_dim_per_head = embed_dim / num_heads;
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
pd.set(1, 1.f);
@@ -65,15 +65,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- q_weight_data.release();
- q_bias_data.release();
- }
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -94,15 +91,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- k_weight_data.release();
- k_bias_data.release();
- }
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -123,15 +117,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- v_weight_data.release();
- v_bias_data.release();
- }
+ v_weight_data.release();
+ v_bias_data.release();
}
{
- qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 0); // transB
@@ -151,7 +142,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
qk_gemm->create_pipeline(opt1);
}
{
- qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -173,7 +164,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
}
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, -1);
pd.set(1, 1);
@@ -183,7 +174,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 1); // transB
@@ -202,11 +193,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- out_weight_data.release();
- out_bias_data.release();
- }
+ out_weight_data.release();
+ out_bias_data.release();
}
return 0;
diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h
index a19a18001f5..55ea41780dd 100644
--- a/src/layer/x86/multiheadattention_x86.h
+++ b/src/layer/x86/multiheadattention_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_x86 : virtual public MultiHeadAttention
+class MultiHeadAttention_x86 : public MultiHeadAttention
{
public:
MultiHeadAttention_x86();
diff --git a/src/layer/x86/packing_x86.h b/src/layer/x86/packing_x86.h
index a00e74a4411..9f8f368039d 100644
--- a/src/layer/x86/packing_x86.h
+++ b/src/layer/x86/packing_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_x86 : virtual public Packing
+class Packing_x86 : public Packing
{
public:
Packing_x86();
diff --git a/src/layer/x86/padding_x86.h b/src/layer/x86/padding_x86.h
index f01a4a19757..8772fe30eed 100644
--- a/src/layer/x86/padding_x86.h
+++ b/src/layer/x86/padding_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_x86 : virtual public Padding
+class Padding_x86 : public Padding
{
public:
Padding_x86();
diff --git a/src/layer/x86/pooling_x86.h b/src/layer/x86/pooling_x86.h
index b79685c1840..030964fcb4d 100644
--- a/src/layer/x86/pooling_x86.h
+++ b/src/layer/x86/pooling_x86.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class Pooling_x86 : virtual public Pooling
+class Pooling_x86 : public Pooling
{
public:
Pooling_x86();
diff --git a/src/layer/x86/prelu_x86.h b/src/layer/x86/prelu_x86.h
index 6bbfeae0f0d..17d60d4b297 100644
--- a/src/layer/x86/prelu_x86.h
+++ b/src/layer/x86/prelu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_x86 : virtual public PReLU
+class PReLU_x86 : public PReLU
{
public:
PReLU_x86();
diff --git a/src/layer/x86/quantize_x86.h b/src/layer/x86/quantize_x86.h
index 6fb2d41d662..5c743fe4cff 100644
--- a/src/layer/x86/quantize_x86.h
+++ b/src/layer/x86/quantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_x86 : virtual public Quantize
+class Quantize_x86 : public Quantize
{
public:
Quantize_x86();
diff --git a/src/layer/x86/relu_x86.h b/src/layer/x86/relu_x86.h
index 6d3cce1c5d8..9d0b5966f53 100644
--- a/src/layer/x86/relu_x86.h
+++ b/src/layer/x86/relu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_x86 : virtual public ReLU
+class ReLU_x86 : public ReLU
{
public:
ReLU_x86();
diff --git a/src/layer/x86/requantize_x86.h b/src/layer/x86/requantize_x86.h
index 02b6880f0e9..febc418654f 100644
--- a/src/layer/x86/requantize_x86.h
+++ b/src/layer/x86/requantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_x86 : virtual public Requantize
+class Requantize_x86 : public Requantize
{
public:
Requantize_x86();
diff --git a/src/layer/x86/reshape_x86.h b/src/layer/x86/reshape_x86.h
index a29b91c1b50..56c8ddfb357 100644
--- a/src/layer/x86/reshape_x86.h
+++ b/src/layer/x86/reshape_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_x86 : virtual public Reshape
+class Reshape_x86 : public Reshape
{
public:
Reshape_x86();
diff --git a/src/layer/x86/roialign_x86.h b/src/layer/x86/roialign_x86.h
index f1c4ff912b3..1b91c1a8cbe 100644
--- a/src/layer/x86/roialign_x86.h
+++ b/src/layer/x86/roialign_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ROIAlign_x86 : virtual public ROIAlign
+class ROIAlign_x86 : public ROIAlign
{
public:
ROIAlign_x86();
diff --git a/src/layer/x86/scale_x86.h b/src/layer/x86/scale_x86.h
index 840e6903c33..f06cf414688 100644
--- a/src/layer/x86/scale_x86.h
+++ b/src/layer/x86/scale_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_x86 : virtual public Scale
+class Scale_x86 : public Scale
{
public:
Scale_x86();
diff --git a/src/layer/x86/selu_x86.h b/src/layer/x86/selu_x86.h
index d7b5bf8a87e..7f4a78f80ed 100644
--- a/src/layer/x86/selu_x86.h
+++ b/src/layer/x86/selu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_x86 : virtual public SELU
+class SELU_x86 : public SELU
{
public:
SELU_x86();
diff --git a/src/layer/x86/shufflechannel_x86.h b/src/layer/x86/shufflechannel_x86.h
index 6adca483c17..1e4328a2560 100644
--- a/src/layer/x86/shufflechannel_x86.h
+++ b/src/layer/x86/shufflechannel_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_x86 : virtual public ShuffleChannel
+class ShuffleChannel_x86 : public ShuffleChannel
{
public:
ShuffleChannel_x86();
diff --git a/src/layer/x86/sigmoid_x86.h b/src/layer/x86/sigmoid_x86.h
index 05ea2c40f11..52bf85d9eaf 100644
--- a/src/layer/x86/sigmoid_x86.h
+++ b/src/layer/x86/sigmoid_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_x86 : virtual public Sigmoid
+class Sigmoid_x86 : public Sigmoid
{
public:
Sigmoid_x86();
diff --git a/src/layer/x86/slice_x86.h b/src/layer/x86/slice_x86.h
index fd6fbf9a1b7..0c9b266f84d 100644
--- a/src/layer/x86/slice_x86.h
+++ b/src/layer/x86/slice_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_x86 : virtual public Slice
+class Slice_x86 : public Slice
{
public:
Slice_x86();
diff --git a/src/layer/x86/softmax_x86.h b/src/layer/x86/softmax_x86.h
index c899dcd1cc8..3d1b733a9ec 100644
--- a/src/layer/x86/softmax_x86.h
+++ b/src/layer/x86/softmax_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_x86 : virtual public Softmax
+class Softmax_x86 : public Softmax
{
public:
Softmax_x86();
diff --git a/src/layer/x86/swish_x86.h b/src/layer/x86/swish_x86.h
index 03c6d5e4b30..76b7c3d83f6 100644
--- a/src/layer/x86/swish_x86.h
+++ b/src/layer/x86/swish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_x86 : virtual public Swish
+class Swish_x86 : public Swish
{
public:
Swish_x86();
diff --git a/src/layer/x86/tanh_x86.h b/src/layer/x86/tanh_x86.h
index 60913d49c7b..e4c4477bc56 100644
--- a/src/layer/x86/tanh_x86.h
+++ b/src/layer/x86/tanh_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_x86 : virtual public TanH
+class TanH_x86 : public TanH
{
public:
TanH_x86();
diff --git a/src/layer/x86/unaryop_x86.h b/src/layer/x86/unaryop_x86.h
index 8e8f6c4d2de..0e4a7ff59e1 100644
--- a/src/layer/x86/unaryop_x86.h
+++ b/src/layer/x86/unaryop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_x86 : virtual public UnaryOp
+class UnaryOp_x86 : public UnaryOp
{
public:
UnaryOp_x86();
diff --git a/src/layer/x86/yolov3detectionoutput_x86.h b/src/layer/x86/yolov3detectionoutput_x86.h
index ef93d4647f8..c378b5827b7 100644
--- a/src/layer/x86/yolov3detectionoutput_x86.h
+++ b/src/layer/x86/yolov3detectionoutput_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Yolov3DetectionOutput_x86 : virtual public Yolov3DetectionOutput
+class Yolov3DetectionOutput_x86 : public Yolov3DetectionOutput
{
public:
Yolov3DetectionOutput_x86();
diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp
index 9b9ba7dc289..1e0d86d73a4 100644
--- a/src/layer/yolodetectionoutput.cpp
+++ b/src/layer/yolodetectionoutput.cpp
@@ -38,7 +38,7 @@ int YoloDetectionOutput::load_param(const ParamDict& pd)
int YoloDetectionOutput::create_pipeline(const Option& opt)
{
{
- softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, 0); // axis
diff --git a/src/layer/yolov3detectionoutput.cpp b/src/layer/yolov3detectionoutput.cpp
index 494fb6d186a..7528f5033cd 100644
--- a/src/layer/yolov3detectionoutput.cpp
+++ b/src/layer/yolov3detectionoutput.cpp
@@ -25,7 +25,7 @@ Yolov3DetectionOutput::Yolov3DetectionOutput()
one_blob_only = false;
support_inplace = false;
- //softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ //softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
// set param
ncnn::ParamDict pd;
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in
index 52393b498e6..dfe8e73ce79 100644
--- a/src/layer_registry.h.in
+++ b/src/layer_registry.h.in
@@ -6,16 +6,22 @@ static const layer_registry_entry layer_registry[] = {
@layer_registry@
};
+static const layer_registry_entry layer_registry_arch[] = {
+@layer_registry_arch@
+};
+
#if NCNN_RUNTIME_CPU && NCNN_AVX512
static const layer_registry_entry layer_registry_avx512[] = {
@layer_registry_avx512@
};
#endif // NCNN_RUNTIME_CPU && NCNN_AVX512
+
#if NCNN_RUNTIME_CPU && NCNN_FMA
static const layer_registry_entry layer_registry_fma[] = {
@layer_registry_fma@
};
#endif // NCNN_RUNTIME_CPU && NCNN_FMA
+
#if NCNN_RUNTIME_CPU && NCNN_AVX
static const layer_registry_entry layer_registry_avx[] = {
@layer_registry_avx@
@@ -45,3 +51,9 @@ static const layer_registry_entry layer_registry_rvv[] = {
@layer_registry_rvv@
};
#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+
+#if NCNN_VULKAN
+static const layer_registry_entry layer_registry_vulkan[] = {
+@layer_registry_vulkan@
+};
+#endif // NCNN_VULKAN
diff --git a/src/mat_pixel_resize.cpp b/src/mat_pixel_resize.cpp
index a559a7dac04..f28ce061bca 100644
--- a/src/mat_pixel_resize.cpp
+++ b/src/mat_pixel_resize.cpp
@@ -38,12 +38,12 @@ static void vresize_two(const short* rows0p, const short* rows1p, int wsize, uns
int16x8_t _r01 = vld1q_s16(rows0p + 8);
int16x8_t _r10 = vld1q_s16(rows1p);
int16x8_t _r11 = vld1q_s16(rows1p + 8);
- int16x8_t _acc00 = vaddq_s16(vqdmulhq_s16(_r00, _b0), vqdmulhq_s16(_r10, _b1));
- int16x8_t _acc01 = vaddq_s16(vqdmulhq_s16(_r01, _b0), vqdmulhq_s16(_r11, _b1));
- int16x8_t _acc10 = vaddq_s16(vqdmulhq_s16(_r00, _b2), vqdmulhq_s16(_r10, _b3));
- int16x8_t _acc11 = vaddq_s16(vqdmulhq_s16(_r01, _b2), vqdmulhq_s16(_r11, _b3));
- uint8x16_t _Dp0 = vcombine_u8(vqrshrun_n_s16(_acc00, 3), vqrshrun_n_s16(_acc01, 3));
- uint8x16_t _Dp1 = vcombine_u8(vqrshrun_n_s16(_acc10, 3), vqrshrun_n_s16(_acc11, 3));
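+ // vqdmulhq_s16 computes (a*b*2)>>16; adding two such terms directly can
+ // overflow int16, so each term is pre-halved (vshrq/vsraq by 1) and the
+ // final narrowing shift drops from 3 to 2 to keep the same overall scale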
+ int16x8_t _acc00 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
+ int16x8_t _acc01 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
+ int16x8_t _acc10 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b2), 1), vqdmulhq_s16(_r10, _b3), 1);
+ int16x8_t _acc11 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b2), 1), vqdmulhq_s16(_r11, _b3), 1);
+ uint8x16_t _Dp0 = vcombine_u8(vqrshrun_n_s16(_acc00, 2), vqrshrun_n_s16(_acc01, 2));
+ uint8x16_t _Dp1 = vcombine_u8(vqrshrun_n_s16(_acc10, 2), vqrshrun_n_s16(_acc11, 2));
vst1q_u8(Dp0, _Dp0);
vst1q_u8(Dp1, _Dp1);
Dp0 += 16;
@@ -55,10 +55,10 @@ static void vresize_two(const short* rows0p, const short* rows1p, int wsize, uns
{
int16x8_t _r0 = vld1q_s16(rows0p);
int16x8_t _r1 = vld1q_s16(rows1p);
- int16x8_t _acc0 = vaddq_s16(vqdmulhq_s16(_r0, _b0), vqdmulhq_s16(_r1, _b1));
- int16x8_t _acc1 = vaddq_s16(vqdmulhq_s16(_r0, _b2), vqdmulhq_s16(_r1, _b3));
- uint8x8_t _Dp0 = vqrshrun_n_s16(_acc0, 3);
- uint8x8_t _Dp1 = vqrshrun_n_s16(_acc1, 3);
+ int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
+ int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b2), 1), vqdmulhq_s16(_r1, _b3), 1);
+ uint8x8_t _Dp0 = vqrshrun_n_s16(_acc0, 2);
+ uint8x8_t _Dp1 = vqrshrun_n_s16(_acc1, 2);
vst1_u8(Dp0, _Dp0);
vst1_u8(Dp1, _Dp1);
Dp0 += 8;
@@ -136,9 +136,9 @@ static void vresize_one(const short* rows0p, const short* rows1p, int wsize, uns
int16x8_t _r01 = vld1q_s16(rows0p + 8);
int16x8_t _r10 = vld1q_s16(rows1p);
int16x8_t _r11 = vld1q_s16(rows1p + 8);
- int16x8_t _acc0 = vaddq_s16(vqdmulhq_s16(_r00, _b0), vqdmulhq_s16(_r10, _b1));
- int16x8_t _acc1 = vaddq_s16(vqdmulhq_s16(_r01, _b0), vqdmulhq_s16(_r11, _b1));
- uint8x16_t _Dp = vcombine_u8(vqrshrun_n_s16(_acc0, 3), vqrshrun_n_s16(_acc1, 3));
+ int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
+ int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
+ uint8x16_t _Dp = vcombine_u8(vqrshrun_n_s16(_acc0, 2), vqrshrun_n_s16(_acc1, 2));
vst1q_u8(Dp, _Dp);
Dp += 16;
rows0p += 16;
@@ -148,8 +148,8 @@ static void vresize_one(const short* rows0p, const short* rows1p, int wsize, uns
{
int16x8_t _r0 = vld1q_s16(rows0p);
int16x8_t _r1 = vld1q_s16(rows1p);
- int16x8_t _acc = vaddq_s16(vqdmulhq_s16(_r0, _b0), vqdmulhq_s16(_r1, _b1));
- uint8x8_t _Dp = vqrshrun_n_s16(_acc, 3);
+ int16x8_t _acc = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
+ uint8x8_t _Dp = vqrshrun_n_s16(_acc, 2);
vst1_u8(Dp, _Dp);
Dp += 8;
rows0p += 8;
diff --git a/src/net.cpp b/src/net.cpp
index a7198d0a16e..ff2ab609137 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -126,6 +126,9 @@ static Option get_masked_option(const Option& opt, int featmask)
opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5));
opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6));
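+ // featmask bit 7 pins this layer to single-threaded execution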
+ if (featmask & (1 << 7))
+ opt1.num_threads = 1;
+
return opt1;
}
@@ -145,6 +148,8 @@ int NetPrivate::upload_model()
}
Option opt_upload = opt;
+ opt_upload.blob_allocator = 0;
+ opt_upload.workspace_allocator = 0;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;
@@ -616,15 +621,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && layer->support_fp16_storage)
+#if NCNN_VFPV4
+ if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && layer->support_fp16_storage)
{
@@ -726,15 +731,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && !layer->support_fp16_storage)
+#if NCNN_VFPV4
+ if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && !layer->support_fp16_storage)
{
@@ -1342,8 +1347,11 @@ int Net::load_param(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+ if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+ if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+ if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
@@ -1354,6 +1362,9 @@ int Net::load_param(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
+
+ // fp16 uniform makes no sense when fp16 arithmetic disabled
+ if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
@@ -1377,9 +1388,15 @@ int Net::load_param(const DataReader& dr)
SCAN_VALUE("%d", top_count)
Layer* layer = create_overwrite_builtin_layer(layer_type);
+#if NCNN_VULKAN
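+ // prefer the vulkan implementation when gpu compute is enabled; cpu fallback below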
+ if (!layer && opt.use_vulkan_compute && d->vkdev)
+ {
+ layer = create_layer_vulkan(layer_type);
+ }
+#endif // NCNN_VULKAN
if (!layer)
{
- layer = create_layer(layer_type);
+ layer = create_layer_cpu(layer_type);
}
if (!layer)
{
@@ -1402,7 +1419,6 @@ int Net::load_param(const DataReader& dr)
// NCNN_LOGE("new layer %d %s", i, layer_name);
layer->bottoms.resize(bottom_count);
-
for (int j = 0; j < bottom_count; j++)
{
char bottom_name[256];
@@ -1446,20 +1462,16 @@ int Net::load_param(const DataReader& dr)
blob_index++;
}
+ int layer_support_vulkan = layer->support_vulkan;
+
// layer specific params
int pdlr = pd.load_param(dr);
if (pdlr != 0)
{
- NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
+ NCNN_LOGE("ParamDict load_param %d %s failed", i, layer_name);
continue;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
// pull out top shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
@@ -1506,10 +1518,62 @@ int Net::load_param(const DataReader& dr)
int lr = layer->load_param(pd);
if (lr != 0)
{
- NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
+ NCNN_LOGE("layer load_param %d %s failed", i, layer_name);
continue;
}
+ if (layer->support_int8_storage)
+ {
+ // no int8 gpu support yet
+ opt.use_vulkan_compute = false;
+ }
+
+ Option opt1 = get_masked_option(opt, layer->featmask);
+#if NCNN_VULKAN
+ if (opt1.use_vulkan_compute)
+ {
+ if (!layer->support_image_storage) opt1.use_image_storage = false;
+ }
+#endif // NCNN_VULKAN
+
+ if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
+ {
+ // the vulkan layer cannot handle these params, recreate as a cpu layer
+ Layer* layer_cpu = create_overwrite_builtin_layer(layer_type);
+ if (!layer_cpu)
+ {
+ layer_cpu = create_layer_cpu(layer_type);
+ }
+ if (!layer_cpu)
+ {
+ layer_cpu = create_custom_layer(layer_type);
+ }
+ if (!layer_cpu)
+ {
+ NCNN_LOGE("layer %s not exists or registered", layer_type);
+ clear();
+ return -1;
+ }
+
+ layer_cpu->type = layer->type;
+ layer_cpu->name = layer->name;
+ layer_cpu->bottoms = layer->bottoms;
+ layer_cpu->tops = layer->tops;
+ layer_cpu->bottom_shapes = layer->bottom_shapes;
+ layer_cpu->top_shapes = layer->top_shapes;
+ layer_cpu->featmask = layer->featmask;
+
+ int lr = layer_cpu->load_param(pd);
+ if (lr != 0)
+ {
+ NCNN_LOGE("layer load_param %d %s failed", i, layer_name);
+ continue;
+ }
+
+ delete layer;
+ layer = layer_cpu;
+ }
+
d->layers[i] = layer;
}
@@ -1579,8 +1643,11 @@ int Net::load_param_bin(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+ if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+ if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+ if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
@@ -1591,6 +1658,9 @@ int Net::load_param_bin(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
+
+ // fp16 uniform makes no sense when fp16 arithmetic disabled
+ if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
@@ -1611,9 +1681,15 @@ int Net::load_param_bin(const DataReader& dr)
READ_VALUE(top_count)
Layer* layer = create_overwrite_builtin_layer(typeindex);
+#if NCNN_VULKAN
+ if (!layer && opt.use_vulkan_compute && d->vkdev)
+ {
+ layer = create_layer_vulkan(typeindex);
+ }
+#endif // NCNN_VULKAN
if (!layer)
{
- layer = create_layer(typeindex);
+ layer = create_layer_cpu(typeindex);
}
if (!layer)
{
@@ -1665,24 +1741,16 @@ int Net::load_param_bin(const DataReader& dr)
layer->tops[j] = top_blob_index;
}
+ int layer_support_vulkan = layer->support_vulkan;
+
// layer specific params
int pdlr = pd.load_param_bin(dr);
if (pdlr != 0)
{
-#if NCNN_STRING
- NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
-#else
- NCNN_LOGE("ParamDict load_param %d failed", i);
-#endif
+ NCNN_LOGE("ParamDict load_param_bin %d failed", i);
continue;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
// pull out top blob shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
@@ -1729,14 +1797,61 @@ int Net::load_param_bin(const DataReader& dr)
int lr = layer->load_param(pd);
if (lr != 0)
{
-#if NCNN_STRING
- NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
-#else
NCNN_LOGE("layer load_param %d failed", i);
-#endif
continue;
}
+ if (layer->support_int8_storage)
+ {
+ // no int8 gpu support yet
+ opt.use_vulkan_compute = false;
+ }
+
+ Option opt1 = get_masked_option(opt, layer->featmask);
+#if NCNN_VULKAN
+ if (opt1.use_vulkan_compute)
+ {
+ if (!layer->support_image_storage) opt1.use_image_storage = false;
+ }
+#endif // NCNN_VULKAN
+
+ if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
+ {
+ // the vulkan layer cannot handle these params, recreate as a cpu layer
+ Layer* layer_cpu = create_overwrite_builtin_layer(typeindex);
+ if (!layer_cpu)
+ {
+ layer_cpu = create_layer_cpu(typeindex);
+ }
+ if (!layer_cpu)
+ {
+ int custom_index = typeindex & ~LayerType::CustomBit;
+ layer_cpu = create_custom_layer(custom_index);
+ }
+ if (!layer_cpu)
+ {
+ NCNN_LOGE("layer %d not exists or registered", typeindex);
+ clear();
+ return -1;
+ }
+
+ layer_cpu->bottoms = layer->bottoms;
+ layer_cpu->tops = layer->tops;
+ layer_cpu->bottom_shapes = layer->bottom_shapes;
+ layer_cpu->top_shapes = layer->top_shapes;
+ layer_cpu->featmask = layer->featmask;
+
+ int lr = layer_cpu->load_param(pd);
+ if (lr != 0)
+ {
+ NCNN_LOGE("layer load_param %d failed", i);
+ continue;
+ }
+
+ delete layer;
+ layer = layer_cpu;
+ }
+
d->layers[i] = layer;
}
@@ -1796,24 +1911,7 @@ int Net::load_model(const DataReader& dr)
break;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
Option opt1 = get_masked_option(opt, layer->featmask);
-#if NCNN_VULKAN
- if (opt1.use_vulkan_compute)
- {
- if (!layer->support_image_storage) opt1.use_image_storage = false;
- }
- else
- {
- layer->vkdev = 0;
- layer->support_vulkan = false;
- }
-#endif // NCNN_VULKAN
int cret = layer->create_pipeline(opt1);
if (cret != 0)
@@ -2378,7 +2476,8 @@ void Extractor::set_light_mode(bool enable)
void Extractor::set_num_threads(int num_threads)
{
- d->opt.num_threads = num_threads;
+ NCNN_LOGE("ex.set_num_threads() is no-op, please set net.opt.num_threads=N before net.load_param()");
+ NCNN_LOGE("If you want to use single thread for only some layer, see https://github.com/Tencent/ncnn/wiki/layer-feat-mask");
}
void Extractor::set_blob_allocator(Allocator* allocator)
@@ -2394,14 +2493,8 @@ void Extractor::set_workspace_allocator(Allocator* allocator)
#if NCNN_VULKAN
void Extractor::set_vulkan_compute(bool enable)
{
- if (d->net->d->opt.use_vulkan_compute)
- {
- d->opt.use_vulkan_compute = enable;
- }
- else
- {
- NCNN_LOGE("set_vulkan_compute failed, network use_vulkan_compute disabled");
- }
+ NCNN_LOGE("ex.set_vulkan_compute() is no-op, please set net.opt.use_vulkan_compute=true/false before net.load_param()");
+ NCNN_LOGE("If you want to disable vulkan for only some layer, see https://github.com/Tencent/ncnn/wiki/layer-feat-mask");
}
void Extractor::set_blob_vkallocator(VkAllocator* allocator)
@@ -2598,8 +2691,8 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
+#if NCNN_VFPV4
+ if (d->opt.use_fp16_storage && cpu_support_arm_vfpv4() && (type == 0))
{
if (feat.elembits() == 16)
{
@@ -2609,7 +2702,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
}
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
diff --git a/src/net.h b/src/net.h
index 98e3ec335f1..d69443bbd3c 100644
--- a/src/net.h
+++ b/src/net.h
@@ -182,9 +182,8 @@ class NCNN_EXPORT Extractor
// enabled by default
void set_light_mode(bool enable);
- // set thread count for this extractor
- // this will overwrite the global setting
- // default count is system depended
+ // deprecated, no-op
+ // instead, set net.opt.num_threads before net.load_param()
void set_num_threads(int num_threads);
// set blob memory allocator
@@ -194,6 +193,8 @@ class NCNN_EXPORT Extractor
void set_workspace_allocator(Allocator* allocator);
#if NCNN_VULKAN
+ // deprecated, no-op
+ // instead, set net.opt.use_vulkan_compute before net.load_param()
void set_vulkan_compute(bool enable);
void set_blob_vkallocator(VkAllocator* allocator);
diff --git a/src/option.cpp b/src/option.cpp
index ea2dd6d25c8..a30dabe55f8 100644
--- a/src/option.cpp
+++ b/src/option.cpp
@@ -74,6 +74,9 @@ Option::Option()
use_winograd63_convolution = true;
use_a53_a55_optimized_kernel = is_current_thread_running_on_a53_a55();
+
+ use_fp16_uniform = true;
+ use_int8_uniform = true;
}
} // namespace ncnn
diff --git a/src/option.h b/src/option.h
index 7d0cc60ba7d..eb2a5a7d342 100644
--- a/src/option.h
+++ b/src/option.h
@@ -144,8 +144,10 @@ class NCNN_EXPORT Option
// but you can force this on/off if you wish
bool use_a53_a55_optimized_kernel;
- bool use_reserved_7;
- bool use_reserved_8;
+ // enable options for fp16/int8 uniform buffers in gpu shaders
+ bool use_fp16_uniform;
+ bool use_int8_uniform;
+
bool use_reserved_9;
bool use_reserved_10;
bool use_reserved_11;
diff --git a/src/ruapu.h b/src/ruapu.h
new file mode 100644
index 00000000000..ff3c19e2c46
--- /dev/null
+++ b/src/ruapu.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024 nihui (https://github.com/nihui)
+// Copyright (c) 2024 kernelbin (https://github.com/kernelbin)
+//
+// ruapu --- detect cpu isa features with a single file
+
+#ifndef RUAPU_H
+#define RUAPU_H
+
+void ruapu_init();
+
+int ruapu_supports(const char* isa);
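+
+// minimal usage sketch (illustrative only; the two functions above are the whole api):
+//
+//   #include <stdio.h>
+//   #define RUAPU_IMPLEMENTATION
+//   #include "ruapu.h"
+//
+//   int main()
+//   {
+//       ruapu_init();
+//       printf("avx2 = %d\n", ruapu_supports("avx2"));
+//       return 0;
+//   }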
+
+#ifdef RUAPU_IMPLEMENTATION
+
+#include <setjmp.h>
+#include <string.h>
+
+#if defined _WIN32
+
+#include <windows.h>
+
+#if WINAPI_FAMILY == WINAPI_FAMILY_APP
+static int ruapu_detect_isa(const void* some_inst)
+{
+ // uwp does not support seh :(
+ (void)some_inst;
+ return 0;
+}
+#else // WINAPI_FAMILY == WINAPI_FAMILY_APP
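+// probe strategy: call into a buffer holding one candidate instruction under a
+// vectored exception handler; if EXCEPTION_ILLEGAL_INSTRUCTION fires we longjmp
+// back and report the isa as unsupported, otherwise the instruction executed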
+static int g_ruapu_sigill_caught = 0;
+static jmp_buf g_ruapu_jmpbuf;
+
+typedef const void* ruapu_some_inst;
+
+static LONG CALLBACK ruapu_catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)
+{
+ if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ g_ruapu_sigill_caught = 1;
+ longjmp(g_ruapu_jmpbuf, -1);
+ }
+
+ return EXCEPTION_CONTINUE_SEARCH;
+}
+
+static int ruapu_detect_isa(const void* some_inst)
+{
+ g_ruapu_sigill_caught = 0;
+
+ PVOID eh = AddVectoredExceptionHandler(1, ruapu_catch_sigill);
+
+ if (setjmp(g_ruapu_jmpbuf) == 0)
+ {
+ ((void (*)())some_inst)();
+ }
+
+ RemoveVectoredExceptionHandler(eh);
+
+ return g_ruapu_sigill_caught ? 0 : 1;
+}
+#endif // WINAPI_FAMILY == WINAPI_FAMILY_APP
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned char ruapu_some_##isa[] = { __VA_ARGS__, 0xc3 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned char ruapu_some_##isa[] = { __VA_ARGS__, 0xc3 };
+#endif
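+
+// e.g. RUAPU_INSTCODE(avx, 0xc5, 0xfc, 0x54, 0xc0) places the raw machine-code
+// bytes for "vandps ymm0,ymm0,ymm0" in the executable .text section, terminated
+// with ret (0xc3), so the buffer can be invoked through a function pointer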
+
+#elif __aarch64__ || defined(_M_ARM64)
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xd65f03c0 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xd65f03c0 };
+#endif
+
+#elif __arm__ || defined(_M_ARM)
+#if __thumb__
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0x4770 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0x4770 };
+#endif
+#else
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xe12fff1e };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xe12fff1e };
+#endif
+#endif
+
+#endif
+
+#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
+#include <signal.h>
+
+static int g_ruapu_sigill_caught = 0;
+static sigjmp_buf g_ruapu_jmpbuf;
+
+typedef void (*ruapu_some_inst)();
+
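+// same probe idea as the win32 path: execute the candidate instruction and
+// treat a caught SIGILL as "not supported", recovering via sigsetjmp/siglongjmp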
+static void ruapu_catch_sigill(int signo, siginfo_t* si, void* data)
+{
+ (void)signo;
+ (void)si;
+ (void)data;
+
+ g_ruapu_sigill_caught = 1;
+ siglongjmp(g_ruapu_jmpbuf, -1);
+}
+
+static int ruapu_detect_isa(ruapu_some_inst some_inst)
+{
+ g_ruapu_sigill_caught = 0;
+
+ struct sigaction sa = { 0 };
+ struct sigaction old_sa;
+ sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
+ sa.sa_sigaction = ruapu_catch_sigill;
+ sigaction(SIGILL, &sa, &old_sa);
+
+ if (sigsetjmp(g_ruapu_jmpbuf, 1) == 0)
+ {
+ some_inst();
+ }
+
+ sigaction(SIGILL, &old_sa, NULL);
+
+ return g_ruapu_sigill_caught ? 0 : 1;
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".byte " #__VA_ARGS__ : : : ); }
+#elif __aarch64__
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".word " #__VA_ARGS__ : : : ); }
+#elif __arm__
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".word " #__VA_ARGS__ : : : ); }
+#endif
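+
+// here each probe is a plain function whose body is the raw instruction
+// encoding emitted through an inline-asm .byte/.word directive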
+
+#else // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+typedef const void* ruapu_some_inst;
+static int ruapu_detect_isa(const void* some_inst)
+{
+ // unknown platform, bare metal os ?
+ (void)some_inst;
+ return 0;
+}
+
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { }
+#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+
+struct ruapu_isa_entry
+{
+ const char* isa;
+ ruapu_some_inst inst;
+ int capable;
+};
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+RUAPU_INSTCODE(mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
+RUAPU_INSTCODE(sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
+RUAPU_INSTCODE(sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
+RUAPU_INSTCODE(sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
+RUAPU_INSTCODE(ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
+RUAPU_INSTCODE(sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
+RUAPU_INSTCODE(sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
+RUAPU_INSTCODE(sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
+RUAPU_INSTCODE(xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd xmm0,xmm0,xmm0,xmm0
+RUAPU_INSTCODE(avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
+RUAPU_INSTCODE(f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
+RUAPU_INSTCODE(fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
+RUAPU_INSTCODE(fma4, 0xc4, 0xe3, 0xfd, 0x68, 0xc0, 0x00) // vfmaddps ymm0,ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
+RUAPU_INSTCODE(avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
+RUAPU_INSTCODE(avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avxvnni, 0xc4, 0xe2, 0x7d, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avxifma, 0xc4, 0xe2, 0xfd, 0xb4, 0xc0) // vpmadd52luq ymm0,ymm0,ymm0
+
+#elif __aarch64__ || defined(_M_ARM64)
+RUAPU_INSTCODE(neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
+RUAPU_INSTCODE(vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
+RUAPU_INSTCODE(cpuid, 0xd5380000) // mrs x0,midr_el1
+RUAPU_INSTCODE(asimdhp, 0x0e401400) // fadd v0.4h,v0.4h,v0.4h
+RUAPU_INSTCODE(asimddp, 0x4e809400) // sdot v0.4s,v0.16b,v0.16b
+RUAPU_INSTCODE(asimdfhm, 0x4e20ec00) // fmlal v0.4s,v0.4h,v0.4h
+RUAPU_INSTCODE(bf16, 0x6e40ec00) // bfmmla v0.4s,v0.8h,v0.8h
+RUAPU_INSTCODE(i8mm, 0x4e80a400) // smmla v0.4s,v0.16b,v0.16b
+RUAPU_INSTCODE(sve, 0x65608000) // fmad z0.h,p0/m,z0.h,z0.h
+RUAPU_INSTCODE(sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
+RUAPU_INSTCODE(svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
+RUAPU_INSTCODE(svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
+RUAPU_INSTCODE(svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
+
+#elif __arm__ || defined(_M_ARM)
+#if __thumb__
+RUAPU_INSTCODE(edsp, 0xfb20, 0x0000) // smlad r0,r0,r0,r0
+RUAPU_INSTCODE(neon, 0xef00, 0x0d40) // vadd.f32 q0,q0,q0
+RUAPU_INSTCODE(vfpv4, 0xffb6, 0x0600) // vcvt.f16.f32 d0,q0
+#else
+RUAPU_INSTCODE(edsp, 0xe7000010) // smlad r0,r0,r0,r0
+RUAPU_INSTCODE(neon, 0xf2000d40) // vadd.f32 q0,q0,q0
+RUAPU_INSTCODE(vfpv4, 0xf3b60600) // vcvt.f16.f32 d0,q0
+#endif
+
+#endif
+
+#undef RUAPU_INSTCODE
+
+#define RUAPU_ISAENTRY(isa) { #isa, (ruapu_some_inst)ruapu_some_##isa, 0 },
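+// e.g. RUAPU_ISAENTRY(avx) expands to: { "avx", (ruapu_some_inst)ruapu_some_avx, 0 },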
+
+struct ruapu_isa_entry g_ruapu_isa_map[] = {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+RUAPU_ISAENTRY(mmx)
+RUAPU_ISAENTRY(sse)
+RUAPU_ISAENTRY(sse2)
+RUAPU_ISAENTRY(sse3)
+RUAPU_ISAENTRY(ssse3)
+RUAPU_ISAENTRY(sse41)
+RUAPU_ISAENTRY(sse42)
+RUAPU_ISAENTRY(sse4a)
+RUAPU_ISAENTRY(xop)
+RUAPU_ISAENTRY(avx)
+RUAPU_ISAENTRY(f16c)
+RUAPU_ISAENTRY(fma)
+RUAPU_ISAENTRY(fma4)
+RUAPU_ISAENTRY(avx2)
+RUAPU_ISAENTRY(avx512f)
+RUAPU_ISAENTRY(avx512bw)
+RUAPU_ISAENTRY(avx512cd)
+RUAPU_ISAENTRY(avx512dq)
+RUAPU_ISAENTRY(avx512vl)
+RUAPU_ISAENTRY(avx512vnni)
+RUAPU_ISAENTRY(avx512bf16)
+RUAPU_ISAENTRY(avx512ifma)
+RUAPU_ISAENTRY(avx512vbmi)
+RUAPU_ISAENTRY(avx512vbmi2)
+RUAPU_ISAENTRY(avx512fp16)
+RUAPU_ISAENTRY(avxvnni)
+RUAPU_ISAENTRY(avxvnniint8)
+RUAPU_ISAENTRY(avxifma)
+
+#elif __aarch64__ || defined(_M_ARM64)
+RUAPU_ISAENTRY(neon)
+RUAPU_ISAENTRY(vfpv4)
+RUAPU_ISAENTRY(cpuid)
+RUAPU_ISAENTRY(asimdhp)
+RUAPU_ISAENTRY(asimddp)
+RUAPU_ISAENTRY(asimdfhm)
+RUAPU_ISAENTRY(bf16)
+RUAPU_ISAENTRY(i8mm)
+RUAPU_ISAENTRY(sve)
+RUAPU_ISAENTRY(sve2)
+RUAPU_ISAENTRY(svebf16)
+RUAPU_ISAENTRY(svei8mm)
+RUAPU_ISAENTRY(svef32mm)
+
+#elif __arm__ || defined(_M_ARM)
+RUAPU_ISAENTRY(edsp)
+RUAPU_ISAENTRY(neon)
+RUAPU_ISAENTRY(vfpv4)
+
+#endif
+};
+
+#undef RUAPU_ISAENTRY
+
+void ruapu_init()
+{
+ for (size_t i = 0; i < sizeof(g_ruapu_isa_map) / sizeof(g_ruapu_isa_map[0]); i++)
+ {
+ g_ruapu_isa_map[i].capable = ruapu_detect_isa(g_ruapu_isa_map[i].inst);
+ }
+}
+
+int ruapu_supports(const char* isa)
+{
+ for (size_t i = 0; i < sizeof(g_ruapu_isa_map) / sizeof(g_ruapu_isa_map[0]); i++)
+ {
+ if (strcmp(g_ruapu_isa_map[i].isa, isa) == 0)
+ {
+ return g_ruapu_isa_map[i].capable;
+ }
+ }
+
+ return 0;
+}
+
+#endif // RUAPU_IMPLEMENTATION
+
+#endif // RUAPU_H
diff --git a/src/simplevk.cpp b/src/simplevk.cpp
index b4d1d778da8..4cd23b3c81b 100644
--- a/src/simplevk.cpp
+++ b/src/simplevk.cpp
@@ -316,6 +316,12 @@ static int load_vulkan_linux(const char* driver_path)
#endif
void* libvulkan = dlopen(libpath, RTLD_LOCAL | RTLD_NOW);
+#if !__APPLE__
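+ // fall back to the versioned runtime soname, which is present even when the
+ // unversioned libvulkan.so development symlink is not installed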
+ if (!libvulkan)
+ {
+ libvulkan = dlopen("libvulkan.so.1", RTLD_LOCAL | RTLD_NOW);
+ }
+#endif
if (!libvulkan)
{
NCNN_LOGE("dlopen failed %s", dlerror());
diff --git a/src/stb_image.h b/src/stb_image.h
index 1b4b337328e..1c2096a3a0d 100644
--- a/src/stb_image.h
+++ b/src/stb_image.h
@@ -786,6 +786,20 @@ static int stbi__sse2_available(void)
#endif
#endif
+// RISC-V VECTOR
+#if defined(STBI_NO_SIMD) && defined(STBI_RVV)
+#undef STBI_RVV
+#endif
+
+#ifdef STBI_RVV
+#include <riscv_vector.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
#ifndef STBI_SIMD_ALIGN
#define STBI_SIMD_ALIGN(type, name) type name
#endif
@@ -2910,6 +2924,180 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
#endif // STBI_NEON
+#ifdef STBI_RVV
+
+// risc-v vector integer IDCT. should produce bit-identical
+// results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+ const short rot0_0 = stbi__f2f(0.5411961f);
+ const short rot0_1 = stbi__f2f(-1.847759065f);
+ const short rot0_2 = stbi__f2f( 0.765366865f);
+ const short rot1_0 = stbi__f2f( 1.175875602f);
+ const short rot1_1 = stbi__f2f(-0.899976223f);
+ const short rot1_2 = stbi__f2f(-2.562915447f);
+ const short rot2_0 = stbi__f2f(-1.961570560f);
+ const short rot2_1 = stbi__f2f(-0.390180644f);
+ const short rot3_0 = stbi__f2f( 0.298631336f);
+ const short rot3_1 = stbi__f2f( 2.053119869f);
+ const short rot3_2 = stbi__f2f( 3.072711026f);
+ const short rot3_3 = stbi__f2f( 1.501321110f);
+
+ // scratch buffer for data transpose
+ short tmp[64];
+
+ const size_t vl = vsetvl_e16m1(8);
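+ // an 8x8 idct needs exactly 8 lanes per vector; with e16m1 this assumes VLEN >= 128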
+
+ // column pass
+ {
+ vint16m1_t row0 = vle16_v_i16m1(data + 0*8, vl);
+ vint16m1_t row1 = vle16_v_i16m1(data + 1*8, vl);
+ vint16m1_t row2 = vle16_v_i16m1(data + 2*8, vl);
+ vint16m1_t row3 = vle16_v_i16m1(data + 3*8, vl);
+ vint16m1_t row4 = vle16_v_i16m1(data + 4*8, vl);
+ vint16m1_t row5 = vle16_v_i16m1(data + 5*8, vl);
+ vint16m1_t row6 = vle16_v_i16m1(data + 6*8, vl);
+ vint16m1_t row7 = vle16_v_i16m1(data + 7*8, vl);
+
+ // even part
+ vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+ vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+ vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+ vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+ vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+ vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+ // odd part
+ vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+ vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+ vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+ vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+ vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+ vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+ vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+ vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+ vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+ vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+ vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+ vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+ vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+ vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+ vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+ vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+ vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+ // bfly32
+ x0 = vadd_vx_i32m2(x0, 512, vl);
+ x1 = vadd_vx_i32m2(x1, 512, vl);
+ x2 = vadd_vx_i32m2(x2, 512, vl);
+ x3 = vadd_vx_i32m2(x3, 512, vl);
+ vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 10, vl);
+ vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 10, vl);
+ vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 10, vl);
+ vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 10, vl);
+ vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 10, vl);
+ vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 10, vl);
+ vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 10, vl);
+ vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 10, vl);
+
+ // 8x8 transpose
+ // I would prefer to implement this transpose in register without save+load,
+ // but rvv does not have shuffle/zip instructions among multiple registers.
+ // what a pity :( --- nihui
+ vsse16_v_i16m1(tmp + 0, 8 * 2, out0, vl);
+ vsse16_v_i16m1(tmp + 1, 8 * 2, out1, vl);
+ vsse16_v_i16m1(tmp + 2, 8 * 2, out2, vl);
+ vsse16_v_i16m1(tmp + 3, 8 * 2, out3, vl);
+ vsse16_v_i16m1(tmp + 4, 8 * 2, out4, vl);
+ vsse16_v_i16m1(tmp + 5, 8 * 2, out5, vl);
+ vsse16_v_i16m1(tmp + 6, 8 * 2, out6, vl);
+ vsse16_v_i16m1(tmp + 7, 8 * 2, out7, vl);
+ }
+
+ // row pass
+ {
+ vint16m1_t row0 = vle16_v_i16m1(tmp + 0*8, vl);
+ vint16m1_t row1 = vle16_v_i16m1(tmp + 1*8, vl);
+ vint16m1_t row2 = vle16_v_i16m1(tmp + 2*8, vl);
+ vint16m1_t row3 = vle16_v_i16m1(tmp + 3*8, vl);
+ vint16m1_t row4 = vle16_v_i16m1(tmp + 4*8, vl);
+ vint16m1_t row5 = vle16_v_i16m1(tmp + 5*8, vl);
+ vint16m1_t row6 = vle16_v_i16m1(tmp + 6*8, vl);
+ vint16m1_t row7 = vle16_v_i16m1(tmp + 7*8, vl);
+
+ // even part
+ vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+ vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+ vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+ vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+ vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+ vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+ // odd part
+ vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+ vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+ vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+ vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+ vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+ vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+ vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+ vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+ vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+ vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+ vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+ vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+ vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+ vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+ vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+ vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+ vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+ // bfly32
+ x0 = vadd_vx_i32m2(x0, (int)(65536 + (128<<17)), vl);
+ x1 = vadd_vx_i32m2(x1, (int)(65536 + (128<<17)), vl);
+ x2 = vadd_vx_i32m2(x2, (int)(65536 + (128<<17)), vl);
+ x3 = vadd_vx_i32m2(x3, (int)(65536 + (128<<17)), vl);
+ vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 17, vl);
+ vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 17, vl);
+ vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 17, vl);
+ vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 17, vl);
+ vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 17, vl);
+ vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 17, vl);
+ vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 17, vl);
+ vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 17, vl);
+
+ // clamp 0~255
+ vuint8m1_t out0u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out0, 0, vl)), 0, vl);
+ vuint8m1_t out7u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out7, 0, vl)), 0, vl);
+ vuint8m1_t out1u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out1, 0, vl)), 0, vl);
+ vuint8m1_t out6u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out6, 0, vl)), 0, vl);
+ vuint8m1_t out2u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out2, 0, vl)), 0, vl);
+ vuint8m1_t out5u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out5, 0, vl)), 0, vl);
+ vuint8m1_t out3u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out3, 0, vl)), 0, vl);
+ vuint8m1_t out4u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out4, 0, vl)), 0, vl);
+
+ // 8x8 transpose
+ vsse8_v_u8m1(out + 0, out_stride, out0u8, vl);
+ vsse8_v_u8m1(out + 1, out_stride, out1u8, vl);
+ vsse8_v_u8m1(out + 2, out_stride, out2u8, vl);
+ vsse8_v_u8m1(out + 3, out_stride, out3u8, vl);
+ vsse8_v_u8m1(out + 4, out_stride, out4u8, vl);
+ vsse8_v_u8m1(out + 5, out_stride, out5u8, vl);
+ vsse8_v_u8m1(out + 6, out_stride, out6u8, vl);
+ vsse8_v_u8m1(out + 7, out_stride, out7u8, vl);
+ }
+}
+
+#endif // STBI_RVV
+
#define STBI__MARKER_none 0xff
// if there's a pending marker from the entropy stream, return that
// otherwise, fetch from the stream and get a marker. if there's no
@@ -3524,7 +3712,7 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
return out;
}
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
{
// need to generate 2x2 samples for every one in input
@@ -3536,6 +3724,48 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
}
t1 = 3*in_near[0] + in_far[0];
+#if defined(STBI_RVV)
+ // process groups of vl pixels for as long as we can.
+ // note we can't handle the last pixel in a row in this loop
+ // because we need to handle the filter boundary conditions.
+ int n = w-1;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m4(n);
+
+ // load and perform the vertical filtering pass
+ vuint8m4_t farb = vle8_v_u8m4(in_far + i, vl);
+ vuint8m4_t nearb = vle8_v_u8m4(in_near + i, vl);
+ vuint16m8_t curr = vadd_vv_u16m8(vwmulu_vx_u16m8(nearb, 3, vl), vwcvtu_x_x_v_u16m8(farb, vl), vl); // current row
+
+ // horizontal filter works the same based on shifted versions of the
+ // current row. "prev" is current row shifted right by 1 pixel; we need
+ // to insert the previous pixel value (from t1).
+ // "next" is current row shifted left by 1 pixel, with the first pixel
+ // of the next block of vl pixels added in.
+ vuint16m8_t prev = vslide1up_vx_u16m8(curr, t1, vl);
+ vuint16m8_t next = vslide1down_vx_u16m8(curr, 3*in_near[i+vl] + in_far[i+vl], vl);
+
+ // horizontal filter, polyphase implementation since it's convenient:
+ // even pixels = 3*cur + prev + 8
+ // odd pixels = 3*cur + next + 8
+ // note the shared term.
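+ // the vertical pass scaled by 4 (weights 3+1) and this pass scales by 4
+ // again, so results are 16x the filtered value; the +8 is the rounding
+ // term for the >>4 that undoes the scaling below.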
+ vuint16m8_t curs = vmacc_vx_u16m8(vmv_v_x_u16m8(8, vl), 3, curr, vl);
+ vuint16m8_t even = vadd_vv_u16m8(curs, prev, vl);
+ vuint16m8_t odd = vadd_vv_u16m8(curs, next, vl);
+
+ // undo scaling and round, then store with even/odd phases interleaved
+ vuint8m4_t evenu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(even, 4, vl), 0, vl);
+ vuint8m4_t oddu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(odd, 4, vl), 0, vl);
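+ // the segment-2 store writes even[j] and odd[j] adjacently for each j,
+ // interleaving the two phases into the 2x-upsampled output row.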
+ vuint8m4x2_t o = vcreate_u8m4x2(evenu8, oddu8);
+ vsseg2e8_v_u8m4x2(out + i*2, o, vl);
+
+ // "previous" value for next iter
+ t1 = 3*in_near[i+(vl-1)] + in_far[i+(vl-1)];
+
+ i += vl;
+ n -= vl;
+ }
+#else
// process groups of 8 pixels for as long as we can.
// note we can't handle the last pixel in a row in this loop
// because we need to handle the filter boundary conditions.
@@ -3622,6 +3852,7 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
// "previous" value for next iter
t1 = 3*in_near[i+7] + in_far[i+7];
}
+#endif
t0 = t1;
t1 = 3*in_near[i] + in_far[i];
@@ -3680,7 +3911,7 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
}
}
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
{
int i = 0;
@@ -3747,7 +3978,47 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
#endif
#ifdef STBI_NEON
- // in this version, step=3 support would be easy to add. but is there demand?
+ if (step == 3) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ uint8x8_t signflip = vdup_n_u8(0x80);
+ int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
+ int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+ int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+ int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
+
+ for (; i+7 < count; i += 8) {
+ // load
+ uint8x8_t y_bytes = vld1_u8(y + i);
+ uint8x8_t cr_bytes = vld1_u8(pcr + i);
+ uint8x8_t cb_bytes = vld1_u8(pcb + i);
+ int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+ int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+ // expand to s16
+ int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+ int16x8_t crw = vshll_n_s8(cr_biased, 7);
+ int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+ // color transform
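+ // vqdmulhq_s16 computes (2*a*b)>>16 with saturation, so with crw/cbw
+ // pre-shifted left by 7 the products come out as (chroma-128)*const>>8,
+ // i.e. in the same 12.4 fixed-point format as yws.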
+ int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+ int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+ int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+ int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+ int16x8_t rws = vaddq_s16(yws, cr0);
+ int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+ int16x8_t bws = vaddq_s16(yws, cb1);
+
+ // undo scaling, round, convert to byte
+ uint8x8x3_t o;
+ o.val[0] = vqrshrun_n_s16(rws, 4);
+ o.val[1] = vqrshrun_n_s16(gws, 4);
+ o.val[2] = vqrshrun_n_s16(bws, 4);
+
+ // store, interleaving r/g/b
+ vst3_u8(out, o);
+ out += 8*3;
+ }
+ }
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
@@ -3792,6 +4063,104 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
}
#endif
+#ifdef STBI_RVV
+ if (step == 3) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ const unsigned char signflip = 0x80;
+ const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+ const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+ const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+ const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+ int n = count;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m2(n);
+
+ // load
+ vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+ vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+ vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+ vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+ vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+ // expand to s16
+ vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+ vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+ vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+ // color transform
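+ // constants are round(c*4096) and crw/cbw carry an extra <<8, so the
+ // 32-bit product >>16 is (chroma-128)*c in the same 12.4 fixed point
+ // as yws, whose +8 above is the 0.5 rounding bias for the final >>4.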
+ vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+ vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+ vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+ vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+ vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+ // undo scaling, round, convert to byte
+ vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+
+ // store, interleaving r/g/b
+ vuint8m2x3_t o = vcreate_u8m2x3(rb, gb, bb);
+ vsseg3e8_v_u8m2x3(out, o, vl);
+ out += vl*3;
+
+ i += vl;
+ n -= vl;
+ }
+ }
+ if (step == 4) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ const unsigned char signflip = 128;
+ const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+ const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+ const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+ const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+ int n = count;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m2(n);
+
+ // load
+ vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+ vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+ vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+ vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+ vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+ // expand to s16
+ vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+ vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+ vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+ // color transform
+ vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+ vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+ vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+ vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+ vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+ // undo scaling, round, convert to byte
+ vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t ab = vmv_v_x_u8m2(255, vl);
+
+ // store, interleaving r/g/b/a
+ vuint8m2x4_t o = vcreate_u8m2x4(rb, gb, bb, ab);
+ vsseg4e8_v_u8m2x4(out, o, vl);
+ out += vl*4;
+
+ i += vl;
+ n -= vl;
+ }
+ }
+#endif
+
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1<<19); // rounding
int r,g,b;
@@ -3835,6 +4204,12 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
#endif
+
+#ifdef STBI_RVV
+ j->idct_block_kernel = stbi__idct_simd;
+ j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+ j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+#endif
}
// clean up the temporary component buffers
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bef56d44a58..d30229b870c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,9 +4,12 @@ if(MSVC)
add_definitions(/wd4996)
endif()
+add_library(ncnntestutil STATIC testutil.cpp)
+target_link_libraries(ncnntestutil PUBLIC ncnn)
+
macro(ncnn_add_test name)
add_executable(test_${name} test_${name}.cpp)
- target_link_libraries(test_${name} PRIVATE ncnn)
+ target_link_libraries(test_${name} PRIVATE ncnntestutil ncnn)
add_test(NAME test_${name} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:test_${name}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake)
@@ -24,7 +27,7 @@ macro(ncnn_add_layer_test class)
foreach(test_file ${test_${name}_SRCS})
get_filename_component(test_filename ${test_file} NAME_WE)
add_executable(${test_filename} ${test_file})
- target_link_libraries(${test_filename} PRIVATE ncnn)
+ target_link_libraries(${test_filename} PRIVATE ncnntestutil ncnn)
add_test(NAME ${test_filename} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:${test_filename}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake)