diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml
index 16602627c43..7298b8b6ab7 100644
--- a/.ci/pnnx.yml
+++ b/.ci/pnnx.yml
@@ -52,6 +52,10 @@ jobs:
- torch-version: 2.1.0
torchvision-version: 0.16.0
torchvision-cache-key: '0_16_0'
+ - torch-version: 2.2.1
+ torchvision-version: 0.17.1
+ torchvision-cache-key: '0_17_1'
+
runs-on:
pool-name: docker
container:
@@ -122,6 +126,7 @@ jobs:
- name: test
run: |
export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}}
+ export LD_LIBRARY_PATH=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install/lib
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
export MKL_ENABLE_INSTRUCTIONS=SSE4_2
@@ -131,8 +136,9 @@ jobs:
- name: python-pnnx
run: |
export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}}
+ export LD_LIBRARY_PATH=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install/lib
export PNNX_WHEEL_WITHOUT_BUILD=ON
- cd tools/pnnx
- cp build/src/pnnx python/pnnx/
+ cd tools/pnnx/python
+ cp ../build/src/pnnx pnnx/
python3 setup.py install --user
- pytest python/tests/
+ pytest tests
diff --git a/.github/workflows/android-armv7-cpu.yml b/.github/workflows/android-armv7-cpu.yml
index 42f85f60f3a..3cddc846389 100644
--- a/.github/workflows/android-armv7-cpu.yml
+++ b/.github/workflows/android-armv7-cpu.yml
@@ -33,12 +33,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: ndk-r16b
run: |
@@ -48,9 +48,9 @@ jobs:
run: |
mkdir build-noneon && cd build-noneon
cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noneon-shared
run: |
mkdir build-noneon-shared && cd build-noneon-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv7-gpu.yml b/.github/workflows/android-armv7-gpu.yml
index 9507d4c2798..d416f7eaf6d 100644
--- a/.github/workflows/android-armv7-gpu.yml
+++ b/.github/workflows/android-armv7-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -DNCNN_ENABLE_LTO=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv8-cpu.yml b/.github/workflows/android-armv8-cpu.yml
index fa920f7aa19..98deabac44b 100644
--- a/.github/workflows/android-armv8-cpu.yml
+++ b/.github/workflows/android-armv8-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-armv8-gpu.yml b/.github/workflows/android-armv8-gpu.yml
index a1cb55104c8..43ff4cee2de 100644
--- a/.github/workflows/android-armv8-gpu.yml
+++ b/.github/workflows/android-armv8-gpu.yml
@@ -37,27 +37,27 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-termux
run: |
mkdir build-termux && cd build-termux
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_PLATFORM_API=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-android-29
run: |
mkdir build-android-29 && cd build-android-29
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-android-29-shared
run: |
mkdir build-android-29-shared && cd build-android-29-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-29 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)

android-aarch64-gpu-ndk-r16b:
runs-on: ubuntu-20.04
@@ -72,4 +72,4 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
diff --git a/.github/workflows/android-x64-cpu.yml b/.github/workflows/android-x64-cpu.yml
index 55bf4018255..0c98606f6f3 100644
--- a/.github/workflows/android-x64-cpu.yml
+++ b/.github/workflows/android-x64-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x64-gpu.yml b/.github/workflows/android-x64-gpu.yml
index 4c0b6237999..d1260fe7e94 100644
--- a/.github/workflows/android-x64-gpu.yml
+++ b/.github/workflows/android-x64-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x86-cpu.yml b/.github/workflows/android-x86-cpu.yml
index cc49975a422..4dbf9b68cac 100644
--- a/.github/workflows/android-x86-cpu.yml
+++ b/.github/workflows/android-x86-cpu.yml
@@ -33,9 +33,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/android-x86-gpu.yml b/.github/workflows/android-x86-gpu.yml
index 8b24690a765..6186968d316 100644
--- a/.github/workflows/android-x86-gpu.yml
+++ b/.github/workflows/android-x86-gpu.yml
@@ -37,9 +37,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml
index 8051371d3e5..c996e51c08e 100644
--- a/.github/workflows/code-format.yml
+++ b/.github/workflows/code-format.yml
@@ -19,7 +19,7 @@ jobs:
- name: cache-clang-format
id: cache-clang-format
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: clang-format-install
key: clang-format-install-4
diff --git a/.github/workflows/ios-arm64-gpu.yml b/.github/workflows/ios-arm64-gpu.yml
index 907f466c386..25f8cf4ad35 100644
--- a/.github/workflows/ios-arm64-gpu.yml
+++ b/.github/workflows/ios-arm64-gpu.yml
@@ -44,7 +44,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-install-20230504
diff --git a/.github/workflows/ios-cpu.yml b/.github/workflows/ios-cpu.yml
index 488b5fe88f7..501eac3377d 100644
--- a/.github/workflows/ios-cpu.yml
+++ b/.github/workflows/ios-cpu.yml
@@ -40,7 +40,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-install-20230504
diff --git a/.github/workflows/ios-simulator-gpu.yml b/.github/workflows/ios-simulator-gpu.yml
index 9a26c0ef0b4..4babdb4e532 100644
--- a/.github/workflows/ios-simulator-gpu.yml
+++ b/.github/workflows/ios-simulator-gpu.yml
@@ -44,7 +44,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-install-20230504
diff --git a/.github/workflows/ios-simulator.yml b/.github/workflows/ios-simulator.yml
index 7bb2a861f9b..1d550638313 100644
--- a/.github/workflows/ios-simulator.yml
+++ b/.github/workflows/ios-simulator.yml
@@ -42,7 +42,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-install-20230504
diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml
index a791da6c26a..0c1032bf9c0 100644
--- a/.github/workflows/linux-aarch64-cpu-gcc.yml
+++ b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -36,7 +36,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20220502-ubuntu-2004-2
@@ -57,7 +57,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -69,34 +69,34 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=OFF -DNCNN_ARM82DOT=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simplemath
run: |
mkdir build-simplestl-simplemath && cd build-simplestl-simplemath
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu-c.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simplemath
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-simplestl-simplemath
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-arm82:
runs-on: ubuntu-20.04
@@ -105,7 +105,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20220502-ubuntu-2004-2
@@ -126,7 +126,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -138,23 +138,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-arm86:
runs-on: ubuntu-22.04
@@ -163,7 +163,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-aarch64-install-20230717
@@ -184,7 +184,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: aarch64-gnu-toolchain
@@ -196,9 +196,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-arm-cpu-gcc.yml b/.github/workflows/linux-arm-cpu-gcc.yml
index 8222586f129..19d9c1cb370 100644
--- a/.github/workflows/linux-arm-cpu-gcc.yml
+++ b/.github/workflows/linux-arm-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -71,23 +71,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)

linux-gcc-armhf:
runs-on: ubuntu-20.04
@@ -96,7 +96,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -117,7 +117,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -129,23 +129,23 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)

linux-gcc-armhf-vfpv3-d16:
runs-on: ubuntu-20.04
@@ -154,7 +154,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-arm-install-20220502-2
@@ -175,7 +175,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: arm-gnu-toolchain
@@ -187,20 +187,20 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build-noint8
- TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-mips-cpu-gcc.yml b/.github/workflows/linux-mips-cpu-gcc.yml
index 7265d2ce0ee..f6e1e74792c 100644
--- a/.github/workflows/linux-mips-cpu-gcc.yml
+++ b/.github/workflows/linux-mips-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mipsel-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsel-gnu-toolchain
@@ -70,13 +70,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-mipsisa32r6el:
runs-on: ubuntu-20.04
@@ -85,7 +85,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mipsel-install-20220502-2
@@ -106,7 +106,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsisa32r6el-gnu-toolchain
@@ -117,10 +117,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-mips64-cpu-gcc.yml b/.github/workflows/linux-mips64-cpu-gcc.yml
index 5ca70798838..890f1054d5b 100644
--- a/.github/workflows/linux-mips64-cpu-gcc.yml
+++ b/.github/workflows/linux-mips64-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mips64el-install-20220502-2
@@ -59,7 +59,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mips64el-gnuabi64-toolchain
@@ -70,13 +70,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)

linux-gcc-mipsisa64r6el:
runs-on: ubuntu-20.04
@@ -85,7 +85,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-mips64el-install-20220502-4
@@ -118,7 +118,7 @@ jobs:
patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch
patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: mipsisa64r6el-gnuabi64-toolchain
@@ -129,10 +129,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-ppc64-cpu-gcc.yml b/.github/workflows/linux-ppc64-cpu-gcc.yml
index 88fdccee092..834bfb56952 100644
--- a/.github/workflows/linux-ppc64-cpu-gcc.yml
+++ b/.github/workflows/linux-ppc64-cpu-gcc.yml
@@ -34,7 +34,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc-install-20220502-2
@@ -55,7 +55,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc-gnu-toolchain
@@ -66,13 +66,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-ppc64le:
runs-on: ubuntu-20.04
@@ -81,7 +81,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -102,7 +102,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -113,13 +113,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-power8le-vsx:
runs-on: ubuntu-20.04
@@ -128,7 +128,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -149,7 +149,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -160,13 +160,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power8le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j $(nproc)

linux-gcc-power9le-vsx:
runs-on: ubuntu-20.04
steps:
@@ -174,7 +174,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-ppc64le-install-20220502-2
@@ -195,7 +195,7 @@ jobs:
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=ppc64le-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: powerpc64le-gnu-toolchain
@@ -206,10 +206,10 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml
index c0e8ca88b0d..cfd9685b800 100644
--- a/.github/workflows/linux-riscv64-cpu-gcc.yml
+++ b/.github/workflows/linux-riscv64-cpu-gcc.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-qemu
id: cache-qemu
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: qemu-install
key: qemu-riscv64-install-20220502-4
@@ -61,7 +61,7 @@ jobs:
wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
- make -j2
+ make -j$(nproc)
make install

- name: riscv64-gnu-toolchain
@@ -72,13 +72,13 @@ jobs:
- name: configure
run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
cd build
- TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc)

linux-gcc-riscv64-c906:
runs-on: [self-hosted, linux, centos]
@@ -106,7 +106,7 @@ jobs:
#- name: cache-qemu
#id: cache-qemu
- #uses: actions/cache@v3
+ #uses: actions/cache@v4
#with:
#path: qemu-install
#key: qemu-riscv64-install-20220502-3
@@ -134,7 +134,7 @@ jobs:
#- name: cache-riscv
#id: cache-riscv
- #uses: actions/cache@v3
+ #uses: actions/cache@v4
#with:
#path: rv64gcv-install-next
#key: rv64gcv-linux-install-20210504
diff --git a/.github/workflows/linux-x64-cpu-clang-python.yml b/.github/workflows/linux-x64-cpu-clang-python.yml
index 9684fcd2e3a..8e6f6718f2f 100644
--- a/.github/workflows/linux-x64-cpu-clang-python.yml
+++ b/.github/workflows/linux-x64-cpu-clang-python.yml
@@ -51,7 +51,7 @@ jobs:
CXX: clang++
run: mkdir build && cd build && cmake -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- name: build
- run: cmake --build build -j 2
+ run: cmake --build build -j $(nproc)
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/linux-x64-cpu-clang.yml b/.github/workflows/linux-x64-cpu-clang.yml
index b03c2e5a8e4..185a3642caa 100644
--- a/.github/workflows/linux-x64-cpu-clang.yml
+++ b/.github/workflows/linux-x64-cpu-clang.yml
@@ -50,9 +50,9 @@ jobs:
run: |
mkdir build-sse2 && cd build-sse2
cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-sse2
- run: cd build-sse2 && ctest --output-on-failure -j 2
+ run: cd build-sse2 && ctest --output-on-failure -j $(nproc)
- name: build-shared
env:
CC: clang
@@ -60,7 +60,7 @@ jobs:
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-avx2
env:
CC: clang
@@ -68,9 +68,9 @@ jobs:
run: |
mkdir build-avx2 && cd build-avx2
cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx2
- run: cd build-avx2 && ctest --output-on-failure -j 2
+ run: cd build-avx2 && ctest --output-on-failure -j $(nproc)
- name: build-avx
env:
CC: clang
@@ -78,9 +78,9 @@ jobs:
run: |
mkdir build-avx && cd build-avx
cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx
- run: cd build-avx && ctest --output-on-failure -j 2
+ run: cd build-avx && ctest --output-on-failure -j $(nproc)
- name: build-avx1-2
env:
CC: clang
@@ -88,9 +88,9 @@ jobs:
run: |
mkdir build-avx1-2 && cd build-avx1-2
cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx1-2
- run: cd build-avx1-2 && ctest --output-on-failure -j 2
+ run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc)
- name: build-noint8
env:
CC: clang
@@ -98,9 +98,9 @@ jobs:
run: |
mkdir build-noint8 && cd build-noint8
cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)

linux-clang-simplestl:
runs-on: ubuntu-latest
@@ -113,9 +113,9 @@ jobs:
run: |
mkdir build-simplestl && cd build-simplestl
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl
- run: cd build-simplestl && ctest --output-on-failure -j 2
+ run: cd build-simplestl && ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simpleomp
env:
CC: clang
@@ -123,6 +123,6 @@ jobs:
run: |
mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simpleomp
- run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2
+ run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-musl.yml b/.github/workflows/linux-x64-cpu-gcc-musl.yml
index d18c9cbc215..cf3d2087d20 100644
--- a/.github/workflows/linux-x64-cpu-gcc-musl.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-musl.yml
@@ -56,12 +56,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
shell: alpine.sh {0}
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-san.yml b/.github/workflows/linux-x64-cpu-gcc-san.yml
index ad266652929..8a52096461f 100644
--- a/.github/workflows/linux-x64-cpu-gcc-san.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-san.yml
@@ -35,8 +35,8 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
run: |
cd build
- ctest --output-on-failure -j 2
+ ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc-sde.yml b/.github/workflows/linux-x64-cpu-gcc-sde.yml
index ca0f777a017..eb680173743 100644
--- a/.github/workflows/linux-x64-cpu-gcc-sde.yml
+++ b/.github/workflows/linux-x64-cpu-gcc-sde.yml
@@ -42,7 +42,7 @@ jobs:
- name: gcc12
run: sudo apt-get install gcc-12 g++-12
- name: Setup SDE binaries
- uses: petarpetrovt/setup-sde@v2.3
+ uses: petarpetrovt/setup-sde@v2.4
- name: build-avx512-spr
env:
CC: gcc-12
@@ -50,8 +50,8 @@ jobs:
run: |
mkdir build-avx512-spr && cd build-avx512-spr
cmake -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx512-spr
run: |
cd build-avx512-spr
- TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x64-cpu-gcc.yml b/.github/workflows/linux-x64-cpu-gcc.yml
index 6d1a41a15b9..ab2185be3e7 100644
--- a/.github/workflows/linux-x64-cpu-gcc.yml
+++ b/.github/workflows/linux-x64-cpu-gcc.yml
@@ -47,42 +47,42 @@ jobs:
run: |
mkdir build-sse2 && cd build-sse2
cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-sse2
- run: cd build-sse2 && ctest --output-on-failure -j 2
+ run: cd build-sse2 && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-avx2
run: |
mkdir build-avx2 && cd build-avx2
cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx2
- run: cd build-avx2 && ctest --output-on-failure -j 2
+ run: cd build-avx2 && ctest --output-on-failure -j $(nproc)
- name: build-avx
run: |
mkdir build-avx && cd build-avx
cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx
- run: cd build-avx && ctest --output-on-failure -j 2
+ run: cd build-avx && ctest --output-on-failure -j $(nproc)
- name: build-avx1-2
run: |
mkdir build-avx1-2 && cd build-avx1-2
cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-avx1-2
- run: cd build-avx1-2 && ctest --output-on-failure -j 2
+ run: cd build-avx1-2 && ctest --output-on-failure -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)

linux-gcc-cpp03-nostdio-nostring-simplestl:
runs-on: ubuntu-20.04
@@ -92,28 +92,28 @@ jobs:
run: |
mkdir build-nostdio && cd build-nostdio
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-nostdio
- run: cd build-nostdio && ctest --output-on-failure -j 2
+ run: cd build-nostdio && ctest --output-on-failure -j $(nproc)
- name: build-nostdio-nostring
run: |
mkdir build-nostdio-nostring && cd build-nostdio-nostring
cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-simplestl
run: |
mkdir build-simplestl && cd build-simplestl
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl
- run: cd build-simplestl && ctest --output-on-failure -j 2
+ run: cd build-simplestl && ctest --output-on-failure -j $(nproc)
- name: build-simplestl-simpleomp
run: |
mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simplestl-simpleomp
- run: cd build-simplestl-simpleomp && ctest --output-on-failure -j 2
+ run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc)

linux-gcc-avx512:
runs-on: [self-hosted, linux, t4]
diff --git a/.github/workflows/linux-x64-gpu-clang-python.yml b/.github/workflows/linux-x64-gpu-clang-python.yml
index 12cd441ad20..ea9232bcfc4 100644
--- a/.github/workflows/linux-x64-gpu-clang-python.yml
+++ b/.github/workflows/linux-x64-gpu-clang-python.yml
@@ -40,7 +40,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
@@ -62,7 +62,7 @@ jobs:
cd swiftshader
mkdir -p build; cd build
cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
mkdir $GITHUB_WORKSPACE/swiftshader-install
cp Linux/* $GITHUB_WORKSPACE/swiftshader-install
- name: set up python ${{ matrix.python-version }}
@@ -80,7 +80,7 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/linux-x64-gpu-clang.yml b/.github/workflows/linux-x64-gpu-clang.yml
index e5eecc37d7b..8ab7e6ae961 100644
--- a/.github/workflows/linux-x64-gpu-clang.yml
+++ b/.github/workflows/linux-x64-gpu-clang.yml
@@ -43,7 +43,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
diff --git a/.github/workflows/linux-x64-gpu-gcc.yml b/.github/workflows/linux-x64-gpu-gcc.yml
index 7d25327eaed..55eb9ff87f2 100644
--- a/.github/workflows/linux-x64-gpu-gcc.yml
+++ b/.github/workflows/linux-x64-gpu-gcc.yml
@@ -43,7 +43,7 @@ jobs:
submodules: true
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-linux-install-20230420
@@ -98,12 +98,12 @@ jobs:
run: |
mkdir build && cd build
cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake ..
- cmake --build . -j 4
+ cmake --build . -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 4
+ cmake --build . -j $(nproc)

linux-gcc-gpu-t4:
runs-on: [self-hosted, linux, t4]
diff --git a/.github/workflows/linux-x86-cpu-clang.yml b/.github/workflows/linux-x86-cpu-clang.yml
index 52ef4e969b0..2ce454c36f4 100644
--- a/.github/workflows/linux-x86-cpu-clang.yml
+++ b/.github/workflows/linux-x86-cpu-clang.yml
@@ -44,9 +44,9 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-shared
env:
CC: clang
@@ -54,7 +54,7 @@ jobs:
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noint8
env:
CC: clang
@@ -62,6 +62,6 @@ jobs:
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/linux-x86-cpu-gcc.yml b/.github/workflows/linux-x86-cpu-gcc.yml
index 3dda6701725..1d88eb3a840 100644
--- a/.github/workflows/linux-x86-cpu-gcc.yml
+++ b/.github/workflows/linux-x86-cpu-gcc.yml
@@ -41,25 +41,25 @@ jobs:
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test
- run: cd build && ctest --output-on-failure -j 2
+ run: cd build && ctest --output-on-failure -j $(nproc)
- name: build-nosse
run: |
mkdir build-nosse && cd build-nosse
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-nosse
- run: cd build-nosse && ctest --output-on-failure -j 2
+ run: cd build-nosse && ctest --output-on-failure -j $(nproc)
- name: build-shared
run: |
mkdir build-shared && cd build-shared
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: build-noint8
run: |
mkdir build-noint8 && cd build-noint8
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-noint8
- run: cd build-noint8 && ctest --output-on-failure -j 2
+ run: cd build-noint8 && ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/mac-catalyst-arm64-cpu.yml b/.github/workflows/mac-catalyst-arm64-cpu.yml
index 52f002897b6..4a4b5bae9ed 100644
--- a/.github/workflows/mac-catalyst-arm64-cpu.yml
+++ b/.github/workflows/mac-catalyst-arm64-cpu.yml
@@ -38,7 +38,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-arm64-gpu.yml b/.github/workflows/mac-catalyst-arm64-gpu.yml
index ff998648772..b1141287176 100644
--- a/.github/workflows/mac-catalyst-arm64-gpu.yml
+++ b/.github/workflows/mac-catalyst-arm64-gpu.yml
@@ -42,7 +42,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-x64-cpu.yml b/.github/workflows/mac-catalyst-x64-cpu.yml
index a21bb2ce8af..ce37229fcb3 100644
--- a/.github/workflows/mac-catalyst-x64-cpu.yml
+++ b/.github/workflows/mac-catalyst-x64-cpu.yml
@@ -46,7 +46,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/mac-catalyst-x64-gpu.yml b/.github/workflows/mac-catalyst-x64-gpu.yml
index 13ac747f212..4dabc202a78 100644
--- a/.github/workflows/mac-catalyst-x64-gpu.yml
+++ b/.github/workflows/mac-catalyst-x64-gpu.yml
@@ -50,7 +50,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-install-20230504
diff --git a/.github/workflows/macos-arm64-cpu.yml b/.github/workflows/macos-arm64-cpu.yml
index 69d7518566e..09351c2f08d 100644
--- a/.github/workflows/macos-arm64-cpu.yml
+++ b/.github/workflows/macos-arm64-cpu.yml
@@ -39,7 +39,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-arm64-gpu.yml b/.github/workflows/macos-arm64-gpu.yml
index 3decda70cf8..1dbbe31ec32 100644
--- a/.github/workflows/macos-arm64-gpu.yml
+++ b/.github/workflows/macos-arm64-gpu.yml
@@ -43,7 +43,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-cpu-python.yml b/.github/workflows/macos-x64-cpu-python.yml
index 9aa7fb2aa58..6d048826064 100644
--- a/.github/workflows/macos-x64-cpu-python.yml
+++ b/.github/workflows/macos-x64-cpu-python.yml
@@ -54,7 +54,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-cpu.yml b/.github/workflows/macos-x64-cpu.yml
index f7e21b5ad1d..6db56205b70 100644
--- a/.github/workflows/macos-x64-cpu.yml
+++ b/.github/workflows/macos-x64-cpu.yml
@@ -50,7 +50,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
diff --git a/.github/workflows/macos-x64-gpu.yml b/.github/workflows/macos-x64-gpu.yml
index 83ad2dc79fd..bfb2ae5805b 100644
--- a/.github/workflows/macos-x64-gpu.yml
+++ b/.github/workflows/macos-x64-gpu.yml
@@ -53,7 +53,7 @@ jobs:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
@@ -107,7 +107,7 @@ jobs:
- name: cache-swiftshader
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-macos-install-20230420
diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml
index 208c3288fe0..38f28e7b1d1 100644
--- a/.github/workflows/release-python.yml
+++ b/.github/workflows/release-python.yml
@@ -37,28 +37,29 @@ jobs:
- uses: actions/upload-artifact@v4
with:
+ name: sdist
path: dist/*.tar.gz

build_wheels:
- name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }}
+ name: ${{ matrix.arch }} ${{ matrix.build_id }} on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
include:
- - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' }
- - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' }
- - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' }
- - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' }
- - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' }
- - { os: ubuntu-20.04, arch: i686, build: 'pp*' }
- - { os: windows-2019, arch: x86, build: 'cp*' }
- - { os: windows-2019, arch: AMD64, build: 'cp*' }
- - { os: windows-2019, arch: AMD64, build: 'pp*' }
- - { os: windows-2019, arch: ARM64, build: 'cp*' }
- - { os: macos-latest, arch: x86_64, build: 'cp*' }
- - { os: macos-latest, arch: x86_64, build: 'pp*' }
- - { os: macos-latest, arch: arm64, build: 'cp*' }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*', build_id: cp-manylinux }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*', build_id: cp-musllinux }
+ - { os: ubuntu-20.04, arch: x86_64, build: 'pp*', build_id: pp }
+ - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*', build_id: cp-manylinux }
+ - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*', build_id: cp-musllinux }
+ - { os: ubuntu-20.04, arch: i686, build: 'pp*', build_id: pp }
+ - { os: windows-2019, arch: x86, build: 'cp*', build_id: cp }
+ - { os: windows-2019, arch: AMD64, build: 'cp*', build_id: cp }
+ - { os: windows-2019, arch: AMD64, build: 'pp*', build_id: pp }
+ - { os: windows-2019, arch: ARM64, build: 'cp*', build_id: cp }
+ - { os: macos-latest, arch: x86_64, build: 'cp*', build_id: cp }
+ - { os: macos-latest, arch: x86_64, build: 'pp*', build_id: pp }
+ - { os: macos-latest, arch: arm64, build: 'cp*', build_id: cp }

steps:
- uses: actions/checkout@v4
@@ -72,24 +73,24 @@ jobs:
# build wheels for ubuntu-20.04
- name: Build wheels for ubuntu
if: matrix.os == 'ubuntu-20.04'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
with:
output-dir: wheelhouse
# build wheels for windows-2019
- name: Build wheels for windows
if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86')
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
CIBW_BEFORE_BUILD: pip install delvewheel
CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel}
with:
@@ -97,12 +98,12 @@ jobs:
- name: Build wheels for windows ARM64
if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
CIBW_BEFORE_BUILD: pip install delvewheel
CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel} --no-dll "msvcp140.dll;vcomp140.dll"
with:
@@ -112,7 +113,7 @@ jobs:
- name: cache-openmp for macos
if: matrix.os == 'macos-latest'
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-install-20230504
@@ -178,12 +179,12 @@ jobs:
- name: Build wheels for macos x86_64
if: matrix.os == 'macos-latest' && matrix.arch == 'x86_64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64"
DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
@@ -195,12 +196,12 @@ jobs:
- name: Build wheels for macos arm64
if: matrix.os == 'macos-latest' && matrix.arch == 'arm64'
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
CIBW_BUILD: ${{ matrix.build }}
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64"
DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
@@ -221,30 +222,19 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
+ name: wheels-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.build_id }}
path: wheelhouse/*.whl
- build_wheels_qemu:
- name: ${{ matrix.arch }} ${{ matrix.build }}
+ build_wheels_qemu_cp:
+ name: ${{ matrix.arch }} ${{ matrix.build_cp }} ${{ matrix.build_sub }}
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
arch: [aarch64, ppc64le, s390x]
- build: [ 'cp36-manylinux*', 'cp37-manylinux*', 'cp38-manylinux*',
- 'cp39-manylinux*', 'cp310-manylinux*', 'cp311-manylinux*',
- 'cp312-manylinux*', 'cp36-musllinux*', 'cp37-musllinux*',
- 'cp38-musllinux*', 'cp39-musllinux*', 'cp310-musllinux*',
- 'cp311-musllinux*', 'cp312-musllinux*' ]
- include:
- - arch: aarch64
- build: 'pp37-*'
- - arch: aarch64
- build: 'pp38-*'
- - arch: aarch64
- build: 'pp39-*'
- - arch: aarch64
- build: 'pp310-*'
+ build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312]
+ build_sub: [manylinux, musllinux]
steps:
- uses: actions/checkout@v4
@@ -261,12 +251,60 @@ jobs:
platforms: all
- name: Build wheels for manylinux with qemu
- uses: pypa/cibuildwheel@v2.16.2
+ uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
- CIBW_BUILD: ${{ matrix.build }}
+ CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}*
+ CIBW_BUILD_VERBOSITY: 1
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
+ with:
+ output-dir: wheelhouse
+
+ - name: Show files
+ run: ls -lh wheelhouse
+ shell: bash
+
+ - name: Verify clean directory
+ run: git diff --exit-code
+ shell: bash
+
+ - name: Upload wheels
+ uses: actions/upload-artifact@v4
+ with:
+ name: wheels_qemu_cp-${{ matrix.arch }}-${{ matrix.build_cp }}-${{ matrix.build_sub }}
+ path: wheelhouse/*.whl
+
+ build_wheels_qemu_pp:
+ name: ${{ matrix.arch }} ${{ matrix.build_pp }}
+ runs-on: ubuntu-20.04
+
+ strategy:
+ fail-fast: false
+ matrix:
+ arch: [aarch64]
+ build_pp: [pp37, pp38, pp39, pp310]
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: '3.x'
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+ with:
+ platforms: all
+
+ - name: Build wheels for manylinux with qemu
+ uses: pypa/cibuildwheel@v2.17.0
+ env:
+ CIBW_ARCHS_LINUX: ${{ matrix.arch }}
+ CIBW_BUILD: ${{ matrix.build_pp }}-*
CIBW_BUILD_VERBOSITY: 1
- CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2
+ CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
with:
output-dir: wheelhouse
@@ -281,13 +319,14 @@ jobs:
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
+ name: wheels_qemu_pp-${{ matrix.arch }}-${{ matrix.build_pp }}
path: wheelhouse/*.whl
upload_all:
permissions:
contents: none
name: Upload
- needs: [build_wheels, build_wheels_qemu, build_sdist]
+ needs: [build_wheels, build_wheels_qemu_cp, build_wheels_qemu_pp, build_sdist]
runs-on: ubuntu-latest
steps:
@@ -297,8 +336,8 @@ jobs:
- uses: actions/download-artifact@v4
with:
- name: artifact
path: dist
+ merge-multiple: true
- uses: pypa/gh-action-pypi-publish@release/v1
with:
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f99d47c711c..5c355f41145 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -102,7 +102,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -134,7 +134,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -166,7 +166,7 @@ jobs:
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -186,7 +186,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-macos-release-11.0.0-20230504
@@ -407,7 +407,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-release-11.0.0-20230504
@@ -677,7 +677,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-bitcode-release-11.0.0-20230504
@@ -947,7 +947,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-release-11.0.0-20230504
@@ -1217,7 +1217,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-ios-simulator-bitcode-release-11.0.0-20230504
@@ -1487,7 +1487,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-release-11.0.0-20230504
@@ -1731,7 +1731,7 @@ jobs:
steps:
- name: cache-openmp
id: cache-openmp
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: openmp-install
key: openmp-mac-catalyst-bitcode-release-11.0.0-20230504
@@ -2185,7 +2185,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2193,7 +2193,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
@@ -2201,7 +2201,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2209,7 +2209,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2242,7 +2242,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2250,7 +2250,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
@@ -2258,7 +2258,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2266,7 +2266,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2299,9 +2299,9 @@ jobs:
run: |
mkdir build-armv7 && cd build-armv7
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2309,15 +2309,15 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
mkdir build-x86 && cd build-x86
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2325,7 +2325,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2358,9 +2358,9 @@ jobs:
run: |
mkdir build-armv7 && cd build-armv7
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-aarch64
run: |
@@ -2368,15 +2368,15 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86
run: |
mkdir build-x86 && cd build-x86
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
- -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-21 \
+ -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-x86_64
run: |
@@ -2384,7 +2384,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2422,7 +2422,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-simd
run: |
@@ -2431,7 +2431,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-threads
run: |
@@ -2440,7 +2440,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: build-simd-threads
run: |
@@ -2449,7 +2449,7 @@ jobs:
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
-DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
cmake --build . --target install/strip
- name: package
run: |
@@ -2479,7 +2479,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2015-x86-x64-install
@@ -2491,24 +2491,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2536,7 +2536,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2015-x86-x64-install
@@ -2548,24 +2548,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2593,7 +2593,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2017-x86-x64-install
@@ -2605,24 +2605,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2650,7 +2650,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2017-x86-x64-install
@@ -2662,24 +2662,24 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2707,7 +2707,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2019-x86-x64-install
@@ -2719,36 +2719,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2780,7 +2780,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2019-x86-x64-install
@@ -2792,36 +2792,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2853,7 +2853,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2022-x86-x64-install
@@ -2865,36 +2865,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -2926,7 +2926,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-vs2022-x86-x64-install
@@ -2938,36 +2938,36 @@ jobs:
cd protobuf-3.11.2
mkdir build-x86; cd build-x86;
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
cd ..
mkdir build-x64; cd build-x64;
cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x86
run: |
mkdir build-x86; cd build-x86
cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-x64
run: |
mkdir build-x64; cd build-x64
cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm
run: |
mkdir build-arm; cd build-arm
cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-arm64
run: |
mkdir build-arm64; cd build-arm64
cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: package
run: |
@@ -3197,7 +3197,7 @@ jobs:
path: artifacts
- name: create-release
- uses: softprops/action-gh-release@v1
+ uses: softprops/action-gh-release@v2
with:
token: ${{ secrets.GITHUB_TOKEN }}
tag_name: ${{ needs.setup.outputs.VERSION }}
diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml
index 7aa9c58d27e..83e6328bb22 100644
--- a/.github/workflows/test-coverage.yml
+++ b/.github/workflows/test-coverage.yml
@@ -119,7 +119,7 @@ jobs:
- name: lcov
run: sudo apt-get install lcov
- name: Setup SDE binaries
- uses: petarpetrovt/setup-sde@v2.3
+ uses: petarpetrovt/setup-sde@v2.4
- name: build-avx512-spr
env:
CC: gcc-12
diff --git a/.github/workflows/web-assembly.yml b/.github/workflows/web-assembly.yml
index f997f9dc2a3..1b5e8915a86 100644
--- a/.github/workflows/web-assembly.yml
+++ b/.github/workflows/web-assembly.yml
@@ -47,30 +47,30 @@ jobs:
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-basic && cd build-basic
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-basic
run: |
cd build-basic
- TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc)
- name: build-simd
run: |
source emsdk/emsdk_env.sh
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-simd && cd build-simd
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simd
run: |
cd build-simd
- TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd" ctest --output-on-failure -j $(nproc)
- name: build-simd-omp
run: |
source emsdk/emsdk_env.sh
export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
mkdir build-simd-omp && cd build-simd-omp
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . -j 2
+ cmake --build . -j $(nproc)
- name: test-simd-omp
run: |
cd build-simd-omp
- TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=node TESTS_EXECUTABLE_LOADER_ARGUMENTS="--experimental-wasm-simd;--experimental-wasm-threads" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml
index 301715b833f..6cf29356c07 100644
--- a/.github/workflows/windows-arm-cpu.yml
+++ b/.github/workflows/windows-arm-cpu.yml
@@ -49,9 +49,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-arm-gpu.yml b/.github/workflows/windows-arm-gpu.yml
index 70db051ac56..787ffdbdc76 100644
--- a/.github/workflows/windows-arm-gpu.yml
+++ b/.github/workflows/windows-arm-gpu.yml
@@ -51,9 +51,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml
index 1fded4ac622..7032385ead0 100644
--- a/.github/workflows/windows-arm64-cpu.yml
+++ b/.github/workflows/windows-arm64-cpu.yml
@@ -49,12 +49,12 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
woa-linux:
name: woa-linux
@@ -82,8 +82,8 @@ jobs:
export PATH=/msvc/bin/arm64:$PATH
mkdir build && cd build
cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_SYSTEM_NAME=Windows -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j $(nproc)
- name: test
run: |
cd build
- TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j 2
+ TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j $(nproc)
diff --git a/.github/workflows/windows-arm64-gpu.yml b/.github/workflows/windows-arm64-gpu.yml
index cb5f9fad430..fa1b8994b2c 100644
--- a/.github/workflows/windows-arm64-gpu.yml
+++ b/.github/workflows/windows-arm64-gpu.yml
@@ -51,9 +51,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64 -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml
index e1f956a4688..3df91ab878b 100644
--- a/.github/workflows/windows-x64-cpu-vs2019-python.yml
+++ b/.github/workflows/windows-x64-cpu-vs2019-python.yml
@@ -50,7 +50,7 @@ jobs:
run: |
mkdir build; cd build
cmake -T v142,host=x64 -A x64 -DNCNN_PYTHON=ON -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: install python
run: cd python && pip install .
- name: test
diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml
index 604d31406ca..67def785c7c 100644
--- a/.github/workflows/windows-x64-cpu.yml
+++ b/.github/workflows/windows-x64-cpu.yml
@@ -61,7 +61,7 @@ jobs:
- uses: actions/checkout@v4
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-${{ matrix.vs-version }}-x64-install-2
@@ -72,31 +72,31 @@ jobs:
7z x ./protobuf-3.11.2.zip
cd protobuf-3.11.2
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: build-sse2
run: |
mkdir build-sse2; cd build-sse2
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-sse2
- run: cd build-sse2; ctest -C Release --output-on-failure -j 2
+ run: cd build-sse2; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=ON -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-avx2
run: |
mkdir build-avx2; cd build-avx2
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-avx2
- run: cd build-avx2; ctest -C Release --output-on-failure -j 2
+ run: cd build-avx2; ctest -C Release --output-on-failure -j 4
- name: build-avx
run: |
mkdir build-avx; cd build-avx
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test-avx
- run: cd build-avx; ctest -C Release --output-on-failure -j 2
+ run: cd build-avx; ctest -C Release --output-on-failure -j 4
diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml
index 1272b7ed920..57be84a602f 100644
--- a/.github/workflows/windows-x64-gpu.yml
+++ b/.github/workflows/windows-x64-gpu.yml
@@ -65,7 +65,7 @@ jobs:
submodules: true
- name: cache-protobuf
id: cache-protobuf
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: "protobuf-install"
key: protobuf-${{ matrix.vs-version }}-x64-install-2
@@ -76,12 +76,12 @@ jobs:
7z x ./protobuf-3.11.2.zip
cd protobuf-3.11.2
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}; cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DNCNN_BUILD_TESTS=ON ../cmake
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
cmake --build . --config Release --target install
- name: cache-swiftshader
if: matrix.vs-version != 'vs2015'
id: cache-swiftshader
- uses: actions/cache@v3
+ uses: actions/cache@v4
with:
path: swiftshader-install
key: swiftshader-${{ matrix.vs-version }}-x64-install-20230420
@@ -103,22 +103,22 @@ jobs:
cd swiftshader
mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"
- name: build
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test
if: matrix.vs-version != 'vs2015'
run: |
echo "[Processor]`nThreadCount=1`n" > build/tests/Release/SwiftShader.ini
Copy-Item -Path "$env:GITHUB_WORKSPACE\swiftshader-install\vulkan-1.dll" -Destination 'build\tests'
- cd build; ctest -C Release --output-on-failure -j 2
+ cd build; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml
index 26d9aaf8b72..68f09157627 100644
--- a/.github/workflows/windows-x86-cpu.yml
+++ b/.github/workflows/windows-x86-cpu.yml
@@ -57,11 +57,11 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: test
- run: cd build; ctest -C Release --output-on-failure -j 2
+ run: cd build; ctest -C Release --output-on-failure -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.github/workflows/windows-x86-gpu.yml b/.github/workflows/windows-x86-gpu.yml
index 4161025f481..4f84665b479 100644
--- a/.github/workflows/windows-x86-gpu.yml
+++ b/.github/workflows/windows-x86-gpu.yml
@@ -59,9 +59,9 @@ jobs:
run: |
mkdir build; cd build
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
- name: build-shared
run: |
mkdir build-shared; cd build-shared
cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32 -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
- cmake --build . --config Release -j 2
+ cmake --build . --config Release -j 4
diff --git a/.gitignore b/.gitignore
index 2c71aee0332..cd69c526f19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,10 +54,9 @@ __pycache__
*.pyd
*.egg-info/
python/setup.py
-tools/pnnx/python/setup.py
# Clangd
.cache/
# Xmake
-.xmake/
\ No newline at end of file
+.xmake/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ffd677bb33..785e2cd3926 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,11 +97,6 @@ else()
endif()
if(NCNN_SHARED_LIB)
- if(NCNN_BUILD_TESTS)
- message(WARNING "NCNN_SHARED_LIB must be OFF to build tests! NCNN_BUILD_TESTS will be turned off.")
- set(NCNN_BUILD_TESTS OFF)
- endif()
-
if(NCNN_ENABLE_LTO)
# enable global link time optimization
cmake_policy(SET CMP0069 NEW)
diff --git a/README.md b/README.md
index 3f1904d8f15..a4b2876a5e2 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-![NCNN](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)
+![ncnn](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)
# ncnn
@@ -6,12 +6,12 @@
[![Download Total Count](https://img.shields.io/github/downloads/Tencent/ncnn/total.svg?style=for-the-badge)](https://github.com/Tencent/ncnn/releases)
[![codecov](https://img.shields.io/codecov/c/github/Tencent/ncnn/master?style=for-the-badge)](https://codecov.io/gh/Tencent/ncnn)
-ncnn is a high-performance neural network inference computing framework optimized for mobile platforms.
+ncnn is a high-performance neural network inference computing framework optimized for mobile platforms.
ncnn is deeply considerate about deployment and uses on mobile phones from the beginning of design.
-ncnn does not have third party dependencies. It is cross-platform, and runs faster than all known open source frameworks on mobile phone cpu.
-Developers can easily deploy deep learning algorithm models to the mobile platform by using efficient ncnn implementation,
-create intelligent APPs, and bring the artificial intelligence to your fingertips.
-ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu and so on.
+ncnn does not have third-party dependencies.
+It is cross-platform and runs faster than all known open-source frameworks on mobile phone CPUs.
+Developers can easily deploy deep learning algorithm models to the mobile platform using the efficient ncnn implementation, creating intelligent apps and bringing artificial intelligence to your fingertips.
+ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu, and so on.
ncnn is a high-performance neural network forward computing framework deeply optimized for mobile platforms.
ncnn has taken mobile deployment and usage into deep consideration from the very beginning of its design.
@@ -29,12 +29,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天
637093648 (lots of experts)
Answer: 卷卷卷卷卷 (full)
Telegram Group
Discord Channel
@@ -47,6 +47,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天
Answer: multi-level intermediate representation
+"They don't even know how good pnnx is" group
+818998520 (new group!)
---
@@ -71,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest
Source
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-full-source.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-full-source.zip)
@@ -91,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest
Android
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-vulkan-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-vulkan-shared.zip)
@@ -105,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest
Android cpuonly
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-android-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-android-shared.zip)
@@ -125,8 +131,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-vulkan-bitcode.zip)
@@ -139,8 +145,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS cpuonly
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-bitcode.zip)
@@ -148,8 +154,8 @@ https://github.com/Tencent/ncnn/releases/latest
iOS-Simulator
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-vulkan-bitcode.zip)
|
@@ -162,14 +168,14 @@ https://github.com/Tencent/ncnn/releases/latest
| iOS-Simulator cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ios-simulator-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ios-simulator-bitcode.zip)
|
-
+ |
|
@@ -182,7 +188,7 @@ https://github.com/Tencent/ncnn/releases/latest
| macOS |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-macos-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-macos-vulkan.zip)
|
@@ -195,7 +201,7 @@ https://github.com/Tencent/ncnn/releases/latest
| macOS cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-macos.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-macos.zip)
|
@@ -203,8 +209,8 @@ https://github.com/Tencent/ncnn/releases/latest
Mac-Catalyst |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-vulkan-bitcode.zip)
|
@@ -217,8 +223,50 @@ https://github.com/Tencent/ncnn/releases/latest
| Mac-Catalyst cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-mac-catalyst-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-mac-catalyst-bitcode.zip)
+
+ |
+
+
+watchOS |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-watchos.zip)
+
+ |
+
+
+ [](https://github.com/Tencent/ncnn/actions?query=workflow%3Awatchos-cpu)
+
+ |
+
+
+watchOS-Simulator |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-watchos-simulator.zip)
+
+ |
+
+
+tvOS |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-tvos.zip)
+
+ |
+
+
+ [](https://github.com/Tencent/ncnn/actions?query=workflow%3Atvos-cpu)
+
+ |
+
+
+tvOS-Simulator |
+
+
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-tvos-simulator.zip)
|
@@ -226,8 +274,8 @@ https://github.com/Tencent/ncnn/releases/latest
Apple xcframework |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-vulkan.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-vulkan-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-vulkan.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-vulkan-bitcode.zip)
|
@@ -238,8 +286,8 @@ https://github.com/Tencent/ncnn/releases/latest
| Apple xcframework cpuonly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-apple-bitcode.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-apple-bitcode.zip)
|
@@ -258,8 +306,8 @@ https://github.com/Tencent/ncnn/releases/latest
Ubuntu 20.04 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2004.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2004-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2004.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2004-shared.zip)
|
@@ -272,8 +320,8 @@ https://github.com/Tencent/ncnn/releases/latest
| Ubuntu 22.04 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2204.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-ubuntu-2204-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2204.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-ubuntu-2204-shared.zip)
|
@@ -292,8 +340,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2015 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2015.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2015-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2015.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2015-shared.zip)
|
@@ -306,8 +354,8 @@ https://github.com/Tencent/ncnn/releases/latest
| VS2017 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2017.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2017-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2017.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2017-shared.zip)
|
@@ -315,8 +363,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2019 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2019.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2019-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2019.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2019-shared.zip)
|
@@ -324,8 +372,8 @@ https://github.com/Tencent/ncnn/releases/latest
VS2022 |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2022.zip)
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-windows-vs2022-shared.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2022.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-windows-vs2022-shared.zip)
|
@@ -344,7 +392,7 @@ https://github.com/Tencent/ncnn/releases/latest
WebAssembly |
- [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20231027-webassembly.zip)
+ [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240102-webassembly.zip)
|
diff --git a/benchmark/README.md b/benchmark/README.md
index d7cae38a242..6cb198e8973 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -1928,6 +1928,50 @@ cooling_down = 1
yolo-fastestv2 min = 316.93 max = 319.86 avg = 318.33
```
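+The sections added below pass `benchncnn` its arguments positionally; as each run's echoed header shows, the order is loop count, thread count, powersave mode, gpu device (-1 means CPU only), and cooling down. For example (the meanings are read off the echoed parameters of the runs in this file, not asserted from benchncnn's own usage text):
+
+```bash
+#           loops  threads  powersave  gpu_device  cooling_down
+./benchncnn 10     4        0          -1          1
+```
+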
+### Radxa Zero 3W, Cortex-A55 (ARMv8.2) (1.416 GHz x 4)
+```
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 34.51 max = 106.19 avg = 79.43
+ squeezenet_int8 min = 31.48 max = 49.87 avg = 34.65
+ mobilenet min = 42.23 max = 45.36 avg = 42.89
+ mobilenet_int8 min = 35.97 max = 53.84 avg = 38.77
+ mobilenet_v2 min = 39.61 max = 40.35 avg = 40.00
+ mobilenet_v3 min = 31.19 max = 31.85 avg = 31.50
+ shufflenet min = 24.75 max = 27.74 avg = 25.55
+ shufflenet_v2 min = 22.00 max = 22.70 avg = 22.31
+ mnasnet min = 34.95 max = 53.55 avg = 37.39
+ proxylessnasnet min = 39.96 max = 44.32 avg = 40.81
+ efficientnet_b0 min = 49.76 max = 67.77 avg = 52.61
+ efficientnetv2_b0 min = 64.00 max = 85.78 avg = 67.06
+ regnety_400m min = 55.23 max = 73.22 avg = 57.87
+ blazeface min = 7.80 max = 10.39 avg = 8.27
+ googlenet min = 98.24 max = 118.27 avg = 101.78
+ googlenet_int8 min = 98.81 max = 115.66 avg = 101.52
+ resnet18 min = 75.33 max = 88.59 avg = 78.19
+ resnet18_int8 min = 76.31 max = 95.17 avg = 79.03
+ alexnet min = 65.07 max = 73.80 avg = 67.18
+ vgg16 min = 423.20 max = 455.15 avg = 436.32
+ vgg16_int8 min = 591.82 max = 620.22 avg = 607.55
+ resnet50 min = 185.53 max = 207.10 avg = 193.03
+ resnet50_int8 min = 176.84 max = 194.73 avg = 181.81
+ squeezenet_ssd min = 96.64 max = 118.46 avg = 100.86
+ squeezenet_ssd_int8 min = 96.61 max = 123.48 avg = 104.64
+ mobilenet_ssd min = 95.38 max = 110.52 avg = 98.61
+ mobilenet_ssd_int8 min = 76.21 max = 95.41 avg = 79.10
+ mobilenet_yolo min = 210.73 max = 235.47 avg = 221.72
+ mobilenetv2_yolov3 min = 134.59 max = 154.33 avg = 139.54
+ yolov4-tiny min = 167.79 max = 191.60 avg = 171.25
+ nanodet_m min = 63.22 max = 80.73 avg = 66.25
+ yolo-fastest-1.1 min = 32.87 max = 88.05 avg = 47.36
+ yolo-fastestv2 min = 26.03 max = 27.01 avg = 26.54
+ vision_transformer min = 3682.51 max = 3882.79 avg = 3809.42
+ FastestDet min = 30.69 max = 50.65 avg = 33.65
+```
+
### Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4)
```
@@ -2647,6 +2691,7 @@ cooling_down = 0
```
### Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)
+
```
nanopc-t4:/data/local/tmp # ./benchncnn 8 2 2 -1 1
loop_count = 8
@@ -2845,7 +2890,95 @@ cooling_down = 0
yolo-fastestv2 min = 24.94 max = 25.07 avg = 25.01
```
+### MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2)
+```
+root@myir-remi-1g:~/ncnn# time ./benchncnn 10 4 0 -1 1
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 85.38 max = 87.72 avg = 86.78
+ squeezenet_int8 min = 84.23 max = 86.46 avg = 85.59
+ mobilenet min = 121.01 max = 122.55 avg = 121.76
+ mobilenet_int8 min = 95.64 max = 97.27 avg = 96.25
+ mobilenet_v2 min = 101.35 max = 102.24 avg = 101.72
+ mobilenet_v3 min = 84.09 max = 86.66 avg = 84.86
+ shufflenet min = 63.32 max = 65.16 avg = 64.53
+ shufflenet_v2 min = 60.33 max = 62.35 avg = 61.04
+ mnasnet min = 95.51 max = 96.70 avg = 95.95
+ proxylessnasnet min = 124.46 max = 125.82 avg = 125.14
+ efficientnet_b0 min = 144.94 max = 146.46 avg = 145.56
+ efficientnetv2_b0 min = 182.87 max = 185.63 avg = 184.56
+ regnety_400m min = 105.31 max = 106.42 avg = 105.72
+ blazeface min = 21.34 max = 21.90 avg = 21.50
+ googlenet min = 313.01 max = 318.42 avg = 314.25
+ googlenet_int8 min = 301.87 max = 304.93 avg = 303.66
+ resnet18 min = 248.02 max = 253.93 avg = 250.12
+ resnet18_int8 min = 244.65 max = 246.62 avg = 245.66
+ alexnet min = 204.00 max = 206.39 avg = 205.21
+ resnet50 min = 583.13 max = 584.82 avg = 584.11
+ resnet50_int8 min = 517.42 max = 520.97 avg = 519.07
+ squeezenet_ssd min = 266.63 max = 273.34 avg = 268.60
+ squeezenet_ssd_int8 min = 255.42 max = 260.98 avg = 257.15
+ mobilenet_ssd min = 267.16 max = 270.41 avg = 268.20
+ mobilenet_ssd_int8 min = 205.03 max = 206.43 avg = 205.53
+ mobilenet_yolo min = 571.08 max = 576.15 avg = 574.18
+ mobilenetv2_yolov3 min = 342.52 max = 344.84 avg = 343.38
+ yolov4-tiny min = 499.74 max = 503.13 avg = 501.45
+ nanodet_m min = 161.87 max = 163.90 avg = 162.93
+ yolo-fastest-1.1 min = 72.84 max = 74.81 avg = 73.35
+ yolo-fastestv2 min = 68.24 max = 70.49 avg = 68.74
+ vision_transformer min = 12464.09 max = 12491.57 avg = 12475.63
+ FastestDet min = 67.92 max = 69.90 avg = 68.94
+```
+
+### OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4)
+
+```
+orangepi@zero2:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 1
+loop_count = 10
+num_threads = 4
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 76.25 max = 90.20 avg = 78.99
+ squeezenet_int8 min = 59.92 max = 60.44 avg = 60.10
+ mobilenet min = 106.91 max = 132.22 avg = 109.99
+ mobilenet_int8 min = 57.96 max = 59.06 avg = 58.19
+ mobilenet_v2 min = 97.93 max = 124.48 avg = 100.91
+ mobilenet_v3 min = 82.27 max = 83.93 avg = 83.00
+ shufflenet min = 55.27 max = 82.06 avg = 58.40
+ shufflenet_v2 min = 44.94 max = 71.99 avg = 48.10
+ mnasnet min = 90.66 max = 91.41 avg = 90.92
+ proxylessnasnet min = 91.55 max = 118.74 avg = 94.71
+ efficientnet_b0 min = 127.95 max = 155.13 avg = 131.25
+ efficientnetv2_b0 min = 145.96 max = 173.67 avg = 149.36
+ regnety_400m min = 102.83 max = 103.52 avg = 103.08
+ blazeface min = 14.46 max = 14.95 avg = 14.77
+ googlenet min = 217.71 max = 244.16 avg = 221.38
+ googlenet_int8 min = 163.04 max = 187.69 avg = 166.20
+ resnet18 min = 251.45 max = 277.52 avg = 255.00
+ resnet18_int8 min = 136.54 max = 161.95 avg = 141.60
+ alexnet min = 212.07 max = 233.27 avg = 215.34
+ vgg16 min = 1206.92 max = 1981.79 avg = 1673.28
+ vgg16_int8 min = 622.93 max = 702.12 avg = 661.83
+ resnet50 min = 555.84 max = 643.69 avg = 576.17
+ resnet50_int8 min = 348.11 max = 374.25 avg = 354.17
+ squeezenet_ssd min = 224.68 max = 251.32 avg = 230.59
+ squeezenet_ssd_int8 min = 154.87 max = 182.66 avg = 159.08
+ mobilenet_ssd min = 238.49 max = 426.65 avg = 263.18
+ mobilenet_ssd_int8 min = 118.36 max = 138.39 avg = 120.78
+ mobilenet_yolo min = 500.28 max = 615.83 avg = 553.59
+ mobilenetv2_yolov3 min = 340.27 max = 369.13 avg = 347.17
+ yolov4-tiny min = 365.04 max = 408.48 avg = 383.93
+ nanodet_m min = 112.88 max = 141.85 avg = 116.13
+ yolo-fastest-1.1 min = 72.05 max = 73.46 avg = 72.68
+ yolo-fastestv2 min = 54.94 max = 55.35 avg = 55.15
+ vision_transformer min = 6842.19 max = 9125.07 avg = 7343.64
+ FastestDet min = 59.09 max = 59.87 avg = 59.35
+```
+
### OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)
Test Ubuntu 22.04 Gnome Desktop
@@ -6803,98 +6936,54 @@ cooling_down = 0
FastestDet min = 4.34 max = 7.47 avg = 5.18
```
-### AWS c5.4xlarge Instance (Intel Xeon Platinum 8124M @ 3.399GHz, Ubuntu 20.04.6 LTS x86_64)
+### AWS c5.4xlarge Instance
-icpc (ICC) 2021.9.0 20230302
+- OS: Ubuntu 20.04.6 LTS x86_64
+- CPU: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
+- Compiler: gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.2)
+- ncnn tag: 20240102
```
-root@ip-172-31-3-216:/opt/ncnn-icc/benchmark# ../build/benchmark/benchncnn 4 8 0 -1
-loop_count = 4
-num_threads = 8
-powersave = 0
-gpu_device = -1
-cooling_down = 1
- squeezenet min = 3.23 max = 3.25 avg = 3.24
- squeezenet_int8 min = 5.97 max = 6.00 avg = 5.98
- mobilenet min = 3.44 max = 3.56 avg = 3.51
- mobilenet_int8 min = 4.53 max = 4.63 avg = 4.59
- mobilenet_v2 min = 3.82 max = 3.95 avg = 3.88
- mobilenet_v3 min = 3.49 max = 3.53 avg = 3.51
- shufflenet min = 3.56 max = 3.58 avg = 3.57
- shufflenet_v2 min = 3.47 max = 3.52 avg = 3.48
- mnasnet min = 3.69 max = 4.07 avg = 3.94
- proxylessnasnet min = 3.75 max = 3.81 avg = 3.78
- efficientnet_b0 min = 4.80 max = 4.85 avg = 4.82
- efficientnetv2_b0 min = 6.51 max = 6.65 avg = 6.59
- regnety_400m min = 8.20 max = 10.79 avg = 9.04
- blazeface min = 1.16 max = 1.19 avg = 1.18
- googlenet min = 9.92 max = 11.07 avg = 10.56
- googlenet_int8 min = 21.50 max = 21.67 avg = 21.59
- resnet18 min = 6.47 max = 6.57 avg = 6.52
- resnet18_int8 min = 19.01 max = 19.17 avg = 19.05
- alexnet min = 5.20 max = 5.28 avg = 5.25
- vgg16 min = 35.74 max = 35.88 avg = 35.79
- vgg16_int8 min = 38.20 max = 38.51 avg = 38.36
- resnet50 min = 14.02 max = 14.12 avg = 14.05
- resnet50_int8 min = 27.92 max = 28.12 avg = 28.03
- squeezenet_ssd min = 9.51 max = 9.70 avg = 9.60
- squeezenet_ssd_int8 min = 12.91 max = 13.06 avg = 12.97
- mobilenet_ssd min = 6.55 max = 6.65 avg = 6.59
- mobilenet_ssd_int8 min = 9.16 max = 9.23 avg = 9.20
- mobilenet_yolo min = 17.02 max = 17.22 avg = 17.13
- mobilenetv2_yolov3 min = 12.67 max = 12.78 avg = 12.71
- yolov4-tiny min = 23.42 max = 23.49 avg = 23.46
- nanodet_m min = 7.27 max = 7.30 avg = 7.28
- yolo-fastest-1.1 min = 4.05 max = 4.08 avg = 4.06
- yolo-fastestv2 min = 4.12 max = 4.15 avg = 4.13
- vision_transformer min = 135.25 max = 136.36 avg = 135.71
- FastestDet min = 4.12 max = 4.21 avg = 4.16
-```
-
-Intel(R) oneAPI DPC++/C++ Compiler 2023.1.0 (2023.1.0.20230320)
-
-```
-root@ip-172-31-3-216:/opt/ncnn-icx/benchmark# ../build/benchmark/benchncnn 4 8 0 -1
loop_count = 4
num_threads = 8
-powersave = 0
+powersave = 2
gpu_device = -1
cooling_down = 1
- squeezenet min = 3.13 max = 3.18 avg = 3.16
- squeezenet_int8 min = 4.04 max = 4.07 avg = 4.06
- mobilenet min = 2.99 max = 3.05 avg = 3.03
- mobilenet_int8 min = 3.68 max = 3.77 avg = 3.73
- mobilenet_v2 min = 3.88 max = 3.92 avg = 3.89
- mobilenet_v3 min = 3.60 max = 3.70 avg = 3.64
- shufflenet min = 3.52 max = 3.54 avg = 3.53
- shufflenet_v2 min = 3.61 max = 3.64 avg = 3.63
- mnasnet min = 3.51 max = 3.53 avg = 3.52
- proxylessnasnet min = 3.73 max = 3.78 avg = 3.75
- efficientnet_b0 min = 4.86 max = 4.95 avg = 4.91
- efficientnetv2_b0 min = 6.84 max = 6.97 avg = 6.91
- regnety_400m min = 7.83 max = 7.89 avg = 7.86
- blazeface min = 1.10 max = 1.13 avg = 1.11
- googlenet min = 9.80 max = 9.89 avg = 9.83
- googlenet_int8 min = 11.32 max = 11.42 avg = 11.37
- resnet18 min = 6.68 max = 6.74 avg = 6.72
- resnet18_int8 min = 8.86 max = 8.92 avg = 8.90
- alexnet min = 5.21 max = 5.25 avg = 5.22
- vgg16 min = 35.77 max = 35.92 avg = 35.88
- vgg16_int8 min = 29.64 max = 29.79 avg = 29.75
- resnet50 min = 14.11 max = 14.31 avg = 14.22
- resnet50_int8 min = 17.73 max = 18.01 avg = 17.86
- squeezenet_ssd min = 9.57 max = 9.65 avg = 9.61
- squeezenet_ssd_int8 min = 9.57 max = 9.67 avg = 9.63
- mobilenet_ssd min = 6.56 max = 6.61 avg = 6.59
- mobilenet_ssd_int8 min = 7.51 max = 7.72 avg = 7.58
- mobilenet_yolo min = 16.89 max = 17.08 avg = 17.00
- mobilenetv2_yolov3 min = 13.79 max = 13.93 avg = 13.86
- yolov4-tiny min = 24.91 max = 25.08 avg = 24.98
- nanodet_m min = 7.42 max = 7.47 avg = 7.44
- yolo-fastest-1.1 min = 4.02 max = 4.09 avg = 4.07
- yolo-fastestv2 min = 4.02 max = 4.04 avg = 4.03
- vision_transformer min = 135.54 max = 136.68 avg = 136.21
- FastestDet min = 4.06 max = 4.10 avg = 4.08
+ squeezenet min = 3.31 max = 3.33 avg = 3.32
+ squeezenet_int8 min = 3.87 max = 4.34 avg = 4.07
+ mobilenet min = 3.12 max = 3.20 avg = 3.17
+ mobilenet_int8 min = 3.32 max = 3.45 avg = 3.38
+ mobilenet_v2 min = 4.23 max = 4.43 avg = 4.33
+ mobilenet_v3 min = 3.82 max = 3.92 avg = 3.87
+ shufflenet min = 3.67 max = 3.72 avg = 3.69
+ shufflenet_v2 min = 4.08 max = 4.22 avg = 4.15
+ mnasnet min = 3.62 max = 3.69 avg = 3.64
+ proxylessnasnet min = 4.29 max = 4.59 avg = 4.37
+ efficientnet_b0 min = 5.32 max = 5.64 avg = 5.50
+ efficientnetv2_b0 min = 6.81 max = 6.88 avg = 6.85
+ regnety_400m min = 9.71 max = 9.77 avg = 9.74
+ blazeface min = 1.71 max = 2.57 avg = 2.10
+ googlenet min = 10.00 max = 10.09 avg = 10.05
+ googlenet_int8 min = 8.76 max = 8.79 avg = 8.77
+ resnet18 min = 6.55 max = 6.91 avg = 6.70
+ resnet18_int8 min = 5.63 max = 5.95 avg = 5.81
+ alexnet min = 4.88 max = 4.91 avg = 4.89
+ vgg16 min = 36.99 max = 37.04 avg = 37.01
+ vgg16_int8 min = 28.13 max = 28.57 avg = 28.31
+ resnet50 min = 13.99 max = 14.13 avg = 14.06
+ resnet50_int8 min = 12.49 max = 12.56 avg = 12.53
+ squeezenet_ssd min = 9.93 max = 10.04 avg = 9.98
+ squeezenet_ssd_int8 min = 9.51 max = 9.70 avg = 9.59
+ mobilenet_ssd min = 6.60 max = 6.63 avg = 6.61
+ mobilenet_ssd_int8 min = 6.95 max = 7.10 avg = 7.02
+ mobilenet_yolo min = 18.28 max = 18.44 avg = 18.35
+ mobilenetv2_yolov3 min = 13.26 max = 13.39 avg = 13.32
+ yolov4-tiny min = 25.14 max = 25.58 avg = 25.37
+ nanodet_m min = 7.71 max = 7.77 avg = 7.75
+ yolo-fastest-1.1 min = 4.69 max = 4.96 avg = 4.81
+ yolo-fastestv2 min = 4.84 max = 5.17 avg = 5.01
+ vision_transformer min = 139.34 max = 140.38 avg = 139.96
+ FastestDet min = 4.95 max = 5.12 avg = 5.06
```
### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU)
@@ -7408,50 +7497,51 @@ cooling_down = 0
- Platform: Xunlei OneCloud (玩客云)
- OS: Armbian buster (20.12) armv7l
-- ncnn tag: 20231027
+- Compiler: gcc version 8.3.0 (Debian 8.3.0-6)
+- ncnn tag: 20240102
```
-mizu-bai@aml-s812:~/ncnn-20231027/benchmark$ ../build/benchmark/benchncnn 4 4 0 -1 1
+mizu-bai@aml-s812:~/ncnn-20240102/benchmark$ ../build/benchmark/benchncnn
loop_count = 4
num_threads = 4
-powersave = 0
+powersave = 2
gpu_device = -1
cooling_down = 1
- squeezenet min = 449.65 max = 636.24 avg = 549.81
- squeezenet_int8 min = 271.84 max = 471.03 avg = 418.24
- mobilenet min = 874.01 max = 1027.19 avg = 927.64
- mobilenet_int8 min = 358.16 max = 555.39 avg = 477.83
- mobilenet_v2 min = 455.49 max = 802.61 avg = 598.32
- mobilenet_v3 min = 388.48 max = 620.12 avg = 535.33
- shufflenet min = 269.15 max = 497.81 avg = 352.46
- shufflenet_v2 min = 220.64 max = 396.63 avg = 305.29
- mnasnet min = 422.92 max = 760.36 avg = 594.63
- proxylessnasnet min = 522.79 max = 889.06 avg = 742.49
- efficientnet_b0 min = 922.67 max = 1014.29 avg = 971.97
- efficientnetv2_b0 min = 1022.19 max = 1153.30 avg = 1092.78
- regnety_400m min = 652.96 max = 972.85 avg = 838.15
- blazeface min = 70.44 max = 131.64 avg = 93.18
- googlenet min = 1599.90 max = 1789.44 avg = 1701.07
- googlenet_int8 min = 925.61 max = 1185.10 avg = 1055.48
- resnet18 min = 1318.23 max = 1586.19 avg = 1422.16
- resnet18_int8 min = 558.06 max = 881.32 avg = 777.04
- alexnet min = 755.06 max = 1109.70 avg = 941.45
- vgg16 min = 6984.48 max = 7085.39 avg = 7024.48
- vgg16_int8 min = 3986.30 max = 4011.83 avg = 3997.05
- resnet50 min = 4196.40 max = 4256.91 avg = 4234.42
- resnet50_int8 min = 2403.39 max = 2630.50 avg = 2512.29
- squeezenet_ssd min = 1039.30 max = 1411.58 avg = 1199.95
- squeezenet_ssd_int8 min = 742.65 max = 952.13 avg = 812.48
- mobilenet_ssd min = 1772.72 max = 1993.07 avg = 1906.95
- mobilenet_ssd_int8 min = 893.49 max = 1076.65 avg = 998.14
- mobilenet_yolo min = 4177.06 max = 4403.88 avg = 4300.91
- mobilenetv2_yolov3 min = 2182.82 max = 2240.35 avg = 2207.98
- yolov4-tiny min = 2441.59 max = 2817.29 avg = 2594.33
- nanodet_m min = 577.75 max = 925.98 avg = 803.88
- yolo-fastest-1.1 min = 247.65 max = 497.52 avg = 311.91
- yolo-fastestv2 min = 207.27 max = 398.13 avg = 314.22
- vision_transformer min = 18775.75 max = 19008.69 avg = 18906.63
- FastestDet min = 296.48 max = 466.79 avg = 354.53
+ squeezenet min = 376.45 max = 445.48 avg = 408.08
+ squeezenet_int8 min = 247.06 max = 340.34 avg = 281.40
+ mobilenet min = 696.71 max = 745.63 avg = 718.49
+ mobilenet_int8 min = 355.78 max = 472.06 avg = 401.17
+ mobilenet_v2 min = 428.86 max = 491.25 avg = 458.45
+ mobilenet_v3 min = 361.78 max = 425.90 avg = 396.94
+ shufflenet min = 245.90 max = 333.41 avg = 293.46
+ shufflenet_v2 min = 210.69 max = 329.51 avg = 260.73
+ mnasnet min = 418.49 max = 493.40 avg = 448.95
+ proxylessnasnet min = 542.20 max = 566.65 avg = 554.75
+ efficientnet_b0 min = 727.72 max = 785.47 avg = 750.72
+ efficientnetv2_b0 min = 805.70 max = 874.57 avg = 843.87
+ regnety_400m min = 627.74 max = 686.57 avg = 660.60
+ blazeface min = 62.14 max = 121.32 avg = 82.10
+ googlenet min = 1295.31 max = 1411.88 avg = 1342.26
+ googlenet_int8 min = 796.39 max = 860.28 avg = 823.76
+ resnet18 min = 1076.93 max = 1125.12 avg = 1099.37
+ resnet18_int8 min = 587.12 max = 634.97 avg = 605.29
+ alexnet min = 701.70 max = 729.68 avg = 718.99
+ vgg16 min = 5584.13 max = 5748.84 avg = 5660.70
+ vgg16_int8 min = 3107.89 max = 3138.78 avg = 3121.28
+ resnet50 min = 3378.84 max = 3461.61 avg = 3425.38
+ resnet50_int8 min = 2044.93 max = 2067.70 avg = 2061.38
+ squeezenet_ssd min = 908.77 max = 972.68 avg = 939.98
+ squeezenet_ssd_int8 min = 609.58 max = 703.88 avg = 662.43
+ mobilenet_ssd min = 1524.69 max = 1589.79 avg = 1552.12
+ mobilenet_ssd_int8 min = 817.70 max = 885.45 avg = 840.30
+ mobilenet_yolo min = 3497.13 max = 3605.83 avg = 3543.72
+ mobilenetv2_yolov3 min = 1734.10 max = 1824.98 avg = 1795.42
+ yolov4-tiny min = 2093.70 max = 2163.44 avg = 2128.30
+ nanodet_m min = 593.75 max = 647.03 avg = 608.03
+ yolo-fastest-1.1 min = 228.68 max = 318.40 avg = 265.74
+ yolo-fastestv2 min = 194.29 max = 258.78 avg = 219.82
+ vision_transformer min = 14836.43 max = 15238.27 avg = 15125.26
+ FastestDet min = 215.60 max = 264.69 avg = 239.85
```
### Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740)
@@ -7670,3 +7760,408 @@ cooling_down = 0
vision_transformer min = 650.85 max = 696.67 avg = 671.13
FastestDet min = 8.63 max = 13.12 avg = 11.39
```
+
+### MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12)
+```
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 8 0 -1 1
+loop_count = 8
+num_threads = 8
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 1.87 max = 2.18 avg = 2.01
+ squeezenet_int8 min = 1.52 max = 1.98 avg = 1.77
+ mobilenet min = 3.02 max = 3.34 avg = 3.15
+ mobilenet_int8 min = 1.90 max = 2.27 avg = 2.04
+ mobilenet_v2 min = 2.72 max = 3.13 avg = 2.89
+ mobilenet_v3 min = 2.20 max = 3.82 avg = 2.78
+ shufflenet min = 1.97 max = 2.56 avg = 2.20
+ shufflenet_v2 min = 1.77 max = 2.29 avg = 1.96
+ mnasnet min = 2.61 max = 3.48 avg = 2.90
+ proxylessnasnet min = 2.72 max = 3.06 avg = 2.89
+ efficientnet_b0 min = 4.57 max = 5.17 avg = 4.89
+ efficientnetv2_b0 min = 5.24 max = 6.72 avg = 5.81
+ regnety_400m min = 4.94 max = 6.78 avg = 5.70
+ blazeface min = 0.80 max = 1.02 avg = 0.91
+ googlenet min = 7.76 max = 8.53 avg = 8.12
+ googlenet_int8 min = 5.68 max = 6.62 avg = 6.19
+ resnet18 min = 5.35 max = 6.06 avg = 5.61
+ resnet18_int8 min = 4.20 max = 4.40 avg = 4.29
+ alexnet min = 5.96 max = 7.30 avg = 6.77
+ vgg16 min = 29.27 max = 30.58 avg = 29.93
+ vgg16_int8 min = 26.72 max = 28.12 avg = 27.27
+ resnet50 min = 15.21 max = 19.16 avg = 16.09
+ resnet50_int8 min = 8.57 max = 9.16 avg = 8.91
+ squeezenet_ssd min = 6.29 max = 7.56 avg = 6.82
+ squeezenet_ssd_int8 min = 5.57 max = 6.96 avg = 6.12
+ mobilenet_ssd min = 6.90 max = 8.90 avg = 7.55
+ mobilenet_ssd_int8 min = 4.53 max = 5.22 avg = 4.86
+ mobilenet_yolo min = 16.88 max = 19.71 avg = 17.88
+ mobilenetv2_yolov3 min = 10.51 max = 14.19 avg = 11.95
+ yolov4-tiny min = 12.81 max = 16.23 avg = 14.22
+ nanodet_m min = 4.38 max = 5.96 avg = 5.19
+ yolo-fastest-1.1 min = 2.22 max = 3.08 avg = 2.73
+ yolo-fastestv2 min = 2.09 max = 2.73 avg = 2.41
+ vision_transformer min = 193.39 max = 203.13 avg = 198.32
+ FastestDet min = 1.98 max = 2.35 avg = 2.16
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 2 -1 1
+loop_count = 8
+num_threads = 4
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 2.23 max = 2.31 avg = 2.27
+ squeezenet_int8 min = 1.68 max = 1.73 avg = 1.70
+ mobilenet min = 3.76 max = 3.86 avg = 3.81
+ mobilenet_int8 min = 2.07 max = 2.16 avg = 2.11
+ mobilenet_v2 min = 2.72 max = 2.95 avg = 2.80
+ mobilenet_v3 min = 2.43 max = 2.51 avg = 2.47
+ shufflenet min = 1.78 max = 1.87 avg = 1.81
+ shufflenet_v2 min = 1.61 max = 1.66 avg = 1.63
+ mnasnet min = 2.69 max = 2.82 avg = 2.76
+ proxylessnasnet min = 2.95 max = 3.13 avg = 3.05
+ efficientnet_b0 min = 4.99 max = 5.29 avg = 5.08
+ efficientnetv2_b0 min = 5.73 max = 5.86 avg = 5.79
+ regnety_400m min = 4.97 max = 5.04 avg = 5.00
+ blazeface min = 1.07 max = 1.17 avg = 1.10
+ googlenet min = 8.51 max = 9.43 avg = 8.75
+ googlenet_int8 min = 6.01 max = 6.13 avg = 6.07
+ resnet18 min = 6.72 max = 7.04 avg = 6.95
+ resnet18_int8 min = 4.31 max = 4.40 avg = 4.34
+ alexnet min = 7.41 max = 7.71 avg = 7.57
+ vgg16 min = 33.77 max = 34.68 avg = 34.08
+ vgg16_int8 min = 32.61 max = 33.83 avg = 33.12
+ resnet50 min = 18.76 max = 19.53 avg = 19.05
+ resnet50_int8 min = 9.56 max = 9.70 avg = 9.61
+ squeezenet_ssd min = 6.86 max = 7.26 avg = 7.01
+ squeezenet_ssd_int8 min = 5.42 max = 6.17 avg = 5.64
+ mobilenet_ssd min = 8.38 max = 9.14 avg = 8.62
+ mobilenet_ssd_int8 min = 4.60 max = 4.90 avg = 4.69
+ mobilenet_yolo min = 19.59 max = 20.06 avg = 19.78
+ mobilenetv2_yolov3 min = 10.46 max = 11.01 avg = 10.70
+ yolov4-tiny min = 13.46 max = 14.18 avg = 13.86
+ nanodet_m min = 4.52 max = 4.59 avg = 4.55
+ yolo-fastest-1.1 min = 1.88 max = 1.94 avg = 1.91
+ yolo-fastestv2 min = 1.73 max = 1.79 avg = 1.76
+ vision_transformer min = 220.32 max = 229.49 avg = 223.92
+ FastestDet min = 1.67 max = 1.73 avg = 1.70
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 1 -1 1
+loop_count = 8
+num_threads = 4
+powersave = 1
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 3.42 max = 4.25 avg = 3.62
+ squeezenet_int8 min = 2.63 max = 2.78 avg = 2.73
+ mobilenet min = 5.66 max = 6.25 avg = 5.82
+ mobilenet_int8 min = 3.13 max = 5.66 avg = 3.58
+ mobilenet_v2 min = 4.40 max = 4.46 avg = 4.42
+ mobilenet_v3 min = 3.74 max = 4.07 avg = 3.94
+ shufflenet min = 2.77 max = 2.86 avg = 2.82
+ shufflenet_v2 min = 2.52 max = 2.62 avg = 2.57
+ mnasnet min = 4.24 max = 4.37 avg = 4.28
+ proxylessnasnet min = 4.65 max = 4.91 avg = 4.74
+ efficientnet_b0 min = 7.71 max = 10.00 avg = 8.08
+ efficientnetv2_b0 min = 9.24 max = 10.34 avg = 9.87
+ regnety_400m min = 7.87 max = 8.35 avg = 8.02
+ blazeface min = 2.38 max = 2.46 avg = 2.40
+ googlenet min = 13.21 max = 13.78 avg = 13.40
+ googlenet_int8 min = 10.23 max = 10.65 avg = 10.36
+ resnet18 min = 9.25 max = 9.68 avg = 9.49
+ resnet18_int8 min = 6.86 max = 6.97 avg = 6.91
+ alexnet min = 9.73 max = 10.53 avg = 9.97
+ vgg16 min = 47.43 max = 48.12 avg = 47.78
+ vgg16_int8 min = 47.08 max = 48.18 avg = 47.46
+ resnet50 min = 26.82 max = 27.14 avg = 26.99
+ resnet50_int8 min = 15.01 max = 15.57 avg = 15.20
+ squeezenet_ssd min = 9.96 max = 12.66 avg = 10.83
+ squeezenet_ssd_int8 min = 8.47 max = 9.26 avg = 8.88
+ mobilenet_ssd min = 12.54 max = 13.25 avg = 12.82
+ mobilenet_ssd_int8 min = 7.03 max = 10.91 avg = 7.94
+ mobilenet_yolo min = 29.73 max = 30.45 avg = 30.23
+ mobilenetv2_yolov3 min = 16.64 max = 17.71 avg = 17.13
+ yolov4-tiny min = 22.25 max = 22.65 avg = 22.45
+ nanodet_m min = 7.56 max = 7.86 avg = 7.69
+ yolo-fastest-1.1 min = 3.32 max = 3.45 avg = 3.39
+ yolo-fastestv2 min = 2.76 max = 2.96 avg = 2.84
+ vision_transformer min = 328.11 max = 337.26 avg = 332.12
+ FastestDet min = 2.66 max = 2.77 avg = 2.71
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 1 2 -1 1
+loop_count = 8
+num_threads = 1
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 5.27 max = 5.35 avg = 5.32
+ squeezenet_int8 min = 3.06 max = 3.22 avg = 3.16
+ mobilenet min = 9.59 max = 9.85 avg = 9.74
+ mobilenet_int8 min = 4.29 max = 4.45 avg = 4.37
+ mobilenet_v2 min = 5.14 max = 5.33 avg = 5.20
+ mobilenet_v3 min = 4.28 max = 4.54 avg = 4.42
+ shufflenet min = 3.18 max = 3.34 avg = 3.27
+ shufflenet_v2 min = 2.78 max = 3.23 avg = 3.05
+ mnasnet min = 5.01 max = 5.38 avg = 5.19
+ proxylessnasnet min = 6.11 max = 6.30 avg = 6.21
+ efficientnet_b0 min = 11.53 max = 11.78 avg = 11.66
+ efficientnetv2_b0 min = 13.88 max = 14.28 avg = 14.13
+ regnety_400m min = 8.11 max = 8.18 avg = 8.16
+ blazeface min = 0.99 max = 1.08 avg = 1.01
+ googlenet min = 19.68 max = 20.71 avg = 20.25
+ googlenet_int8 min = 13.42 max = 13.86 avg = 13.60
+ resnet18 min = 18.10 max = 18.84 avg = 18.53
+ resnet18_int8 min = 9.67 max = 10.17 avg = 9.99
+ alexnet min = 15.76 max = 16.35 avg = 16.03
+ vgg16 min = 70.22 max = 72.85 avg = 71.58
+ vgg16_int8 min = 76.83 max = 79.70 avg = 78.45
+ resnet50 min = 39.73 max = 41.24 avg = 40.30
+ resnet50_int8 min = 20.76 max = 21.54 avg = 21.27
+ squeezenet_ssd min = 12.63 max = 18.67 avg = 15.20
+ squeezenet_ssd_int8 min = 10.29 max = 16.13 avg = 14.13
+ mobilenet_ssd min = 17.21 max = 18.43 avg = 17.68
+ mobilenet_ssd_int8 min = 8.92 max = 9.49 avg = 9.07
+ mobilenet_yolo min = 37.45 max = 38.29 avg = 37.88
+ mobilenetv2_yolov3 min = 19.18 max = 19.83 avg = 19.58
+ yolov4-tiny min = 27.06 max = 27.86 avg = 27.45
+ nanodet_m min = 9.33 max = 9.50 avg = 9.42
+ yolo-fastest-1.1 min = 3.48 max = 3.59 avg = 3.54
+ yolo-fastestv2 min = 2.29 max = 2.37 avg = 2.33
+ vision_transformer min = 730.38 max = 739.99 avg = 735.77
+ FastestDet min = 2.40 max = 2.48 avg = 2.43
+k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 64 1 2 0 0
+[0 Mali-G720-Immortalis MC12] queueC=0[2] queueG=0[2] queueT=0[2]
+[0 Mali-G720-Immortalis MC12] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0
+[0 Mali-G720-Immortalis MC12] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1
+[0 Mali-G720-Immortalis MC12] subgroup=16 basic/vote/ballot/shuffle=1/1/1/1
+[0 Mali-G720-Immortalis MC12] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
+loop_count = 64
+num_threads = 1
+powersave = 2
+gpu_device = 0
+cooling_down = 0
+ squeezenet min = 11.26 max = 13.58 avg = 12.32
+ squeezenet_int8 min = 3.08 max = 3.29 avg = 3.17
+ mobilenet min = 11.96 max = 14.52 avg = 13.48
+ mobilenet_int8 min = 4.20 max = 4.58 avg = 4.34
+ mobilenet_v2 min = 13.62 max = 16.46 avg = 14.62
+ mobilenet_v3 min = 13.98 max = 17.16 avg = 15.25
+ shufflenet min = 10.22 max = 11.82 avg = 11.07
+ shufflenet_v2 min = 12.42 max = 15.39 avg = 14.35
+ mnasnet min = 12.94 max = 16.30 avg = 14.91
+ proxylessnasnet min = 13.18 max = 16.55 avg = 15.05
+ efficientnet_b0 min = 16.70 max = 20.35 avg = 18.27
+ efficientnetv2_b0 min = 54.09 max = 70.05 avg = 58.68
+ regnety_400m min = 16.20 max = 18.42 avg = 17.27
+ blazeface min = 6.50 max = 7.86 avg = 6.93
+ googlenet min = 15.29 max = 17.54 avg = 16.19
+ googlenet_int8 min = 20.38 max = 22.08 avg = 20.98
+ resnet18 min = 12.22 max = 15.63 avg = 14.27
+ resnet18_int8 min = 9.50 max = 10.46 avg = 9.75
+ alexnet min = 12.00 max = 16.09 avg = 13.65
+ vgg16 min = 31.06 max = 32.77 avg = 31.85
+ vgg16_int8 min = 115.72 max = 123.71 avg = 118.23
+ resnet50 min = 15.74 max = 16.53 avg = 16.10
+ resnet50_int8 min = 32.43 max = 33.78 avg = 33.07
+ squeezenet_ssd min = 17.24 max = 21.80 avg = 20.68
+ squeezenet_ssd_int8 min = 9.69 max = 10.52 avg = 9.97
+ mobilenet_ssd min = 15.32 max = 17.63 avg = 16.62
+ mobilenet_ssd_int8 min = 8.84 max = 9.54 avg = 9.05
+ mobilenet_yolo min = 16.67 max = 18.21 avg = 17.25
+ mobilenetv2_yolov3 min = 20.08 max = 25.40 avg = 23.12
+ yolov4-tiny min = 21.98 max = 29.67 avg = 24.75
+ nanodet_m min = 23.19 max = 29.95 avg = 25.69
+ yolo-fastest-1.1 min = 15.07 max = 17.78 avg = 16.49
+ yolo-fastestv2 min = 14.67 max = 16.07 avg = 15.44
+ vision_transformer min = 768.04 max = 801.48 avg = 786.79
+ FastestDet min = 8.33 max = 16.07 avg = 14.38
+```
+
+### Xeon Phi 3120A (1.10 GHz 57-core 228-thread)
+
+- Host: CentOS 7.9
+- Compiler: icc & icpc (ICC) 17.0.2 20170213
+- ncnn tag: 20240102
+
+Build command
+
+```bash
+$ CC=icc CXX=icpc CFLAGS="-mmic" CXXFLAGS="-mmic" cmake .. -DCMAKE_BUILD_TYPE=Release -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF
+```
+
+Copy the whole `ncnn` directory and the libraries in `/opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib` to `mic0`, then set the `LD_LIBRARY_PATH` environment variable there. Some tools cannot be built, but `benchncnn` works. The built `benchncnn` targets the Intel Xeon Phi coprocessor (k1om):
+
+```bash
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ file benchncnn
+benchncnn: ELF 64-bit LSB executable, Intel Xeon Phi coprocessor (k1om), version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.32, not stripped
+```
+
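+A minimal sketch of that staging step (assumptions only: the card is reachable as `mic0` over ssh/scp, and `~/mic-lib` is a hypothetical staging directory for the runtime libraries):
+
+```bash
+# Stage ncnn and the MIC runtime libraries on the coprocessor.
+ssh mic0 'mkdir -p ~/mic-lib'
+scp -r ncnn mic0:~/
+scp /opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib/*.so* mic0:~/mic-lib/
+
+# Then, on mic0, expose the runtime libraries before running benchncnn.
+export LD_LIBRARY_PATH=$HOME/mic-lib:$LD_LIBRARY_PATH
+```
+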
+The benchmark runs in native mode: ssh into the Xeon Phi with `ssh user@mic0`, then run `benchncnn` as on a general Linux system.
+
+```
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 56 0 -1 1
+loop_count = 4
+num_threads = 56
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 43.42 max = 44.20 avg = 43.64
+ squeezenet_int8 min = 161.92 max = 162.41 avg = 162.15
+ mobilenet min = 44.49 max = 46.90 avg = 45.68
+ mobilenet_int8 min = 230.47 max = 232.40 avg = 231.77
+ mobilenet_v2 min = 57.22 max = 62.03 avg = 59.42
+ mobilenet_v3 min = 301.16 max = 306.62 avg = 303.90
+ shufflenet min = 65.80 max = 70.18 avg = 67.70
+ shufflenet_v2 min = 49.54 max = 53.17 avg = 51.22
+ mnasnet min = 521.87 max = 527.76 avg = 524.63
+ proxylessnasnet min = 745.79 max = 748.55 avg = 746.92
+ efficientnet_b0 min = 582.21 max = 584.64 avg = 583.34
+ efficientnetv2_b0 min = 84.13 max = 86.13 avg = 85.19
+ regnety_400m min = 209.67 max = 214.84 avg = 212.39
+ blazeface min = 26.33 max = 27.39 avg = 26.74
+ googlenet min = 124.14 max = 125.72 avg = 124.83
+ googlenet_int8 min = 498.36 max = 502.37 avg = 500.29
+ resnet18 min = 87.86 max = 88.83 avg = 88.35
+ resnet18_int8 min = 359.50 max = 360.71 avg = 360.11
+ alexnet min = 49.87 max = 51.25 avg = 50.76
+ vgg16 min = 341.87 max = 343.92 avg = 342.42
+ vgg16_int8 min = 1649.34 max = 1655.37 avg = 1652.98
+ resnet50 min = 198.91 max = 202.32 avg = 200.58
+ resnet50_int8 min = 983.48 max = 988.73 avg = 986.22
+ squeezenet_ssd min = 108.33 max = 111.45 avg = 110.18
+ squeezenet_ssd_int8 min = 368.96 max = 370.30 avg = 369.54
+ mobilenet_ssd min = 98.29 max = 101.49 avg = 99.99
+ mobilenet_ssd_int8 min = 462.18 max = 466.20 avg = 464.85
+ mobilenet_yolo min = 262.42 max = 266.84 avg = 263.91
+ mobilenetv2_yolov3 min = 159.20 max = 161.58 avg = 160.66
+ yolov4-tiny min = 229.22 max = 230.48 avg = 229.87
+ nanodet_m min = 115.10 max = 116.78 avg = 115.86
+ yolo-fastest-1.1 min = 154.48 max = 155.33 avg = 154.79
+ yolo-fastestv2 min = 161.10 max = 163.98 avg = 161.88
+ vision_transformer min = 848.51 max = 863.03 avg = 854.92
+ FastestDet min = 251.64 max = 253.22 avg = 252.38
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 112 0 -1 1
+loop_count = 4
+num_threads = 112
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 41.07 max = 41.19 avg = 41.12
+ squeezenet_int8 min = 161.73 max = 163.90 avg = 162.74
+ mobilenet min = 36.82 max = 37.53 avg = 37.11
+ mobilenet_int8 min = 231.50 max = 233.81 avg = 232.65
+ mobilenet_v2 min = 53.12 max = 55.87 avg = 54.44
+ mobilenet_v3 min = 277.82 max = 280.61 avg = 279.66
+ shufflenet min = 64.11 max = 64.92 avg = 64.63
+ shufflenet_v2 min = 48.23 max = 50.00 avg = 49.19
+ mnasnet min = 532.09 max = 534.73 avg = 533.34
+ proxylessnasnet min = 760.43 max = 763.94 avg = 762.34
+ efficientnet_b0 min = 534.29 max = 547.51 avg = 541.29
+ efficientnetv2_b0 min = 75.94 max = 76.88 avg = 76.39
+ regnety_400m min = 226.37 max = 227.81 avg = 227.23
+ blazeface min = 26.03 max = 26.93 avg = 26.51
+ googlenet min = 106.53 max = 107.54 avg = 107.06
+ googlenet_int8 min = 503.01 max = 505.16 avg = 504.13
+ resnet18 min = 73.63 max = 76.61 avg = 75.11
+ resnet18_int8 min = 358.18 max = 359.50 avg = 358.99
+ alexnet min = 37.40 max = 38.17 avg = 37.83
+ vgg16 min = 244.95 max = 250.05 avg = 247.24
+ vgg16_int8 min = 1511.89 max = 1512.66 avg = 1512.35
+ resnet50 min = 151.99 max = 154.66 avg = 153.37
+ resnet50_int8 min = 954.16 max = 957.63 avg = 956.55
+ squeezenet_ssd min = 91.46 max = 97.18 avg = 94.00
+ squeezenet_ssd_int8 min = 368.03 max = 375.96 avg = 370.99
+ mobilenet_ssd min = 79.61 max = 81.38 avg = 80.33
+ mobilenet_ssd_int8 min = 458.93 max = 463.41 avg = 461.63
+ mobilenet_yolo min = 234.59 max = 236.91 avg = 235.43
+ mobilenetv2_yolov3 min = 145.82 max = 146.92 avg = 146.23
+ yolov4-tiny min = 219.22 max = 220.51 avg = 219.83
+ nanodet_m min = 109.43 max = 113.94 avg = 112.20
+ yolo-fastest-1.1 min = 158.13 max = 160.59 avg = 159.20
+ yolo-fastestv2 min = 162.05 max = 162.80 avg = 162.47
+ vision_transformer min = 615.14 max = 625.35 avg = 618.47
+ FastestDet min = 279.98 max = 282.49 avg = 281.14
+[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 224 0 -1 1
+loop_count = 4
+num_threads = 224
+powersave = 0
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 45.54 max = 46.81 avg = 46.13
+ squeezenet_int8 min = 186.81 max = 187.14 avg = 186.97
+ mobilenet min = 38.33 max = 39.11 avg = 38.64
+ mobilenet_int8 min = 251.06 max = 251.91 avg = 251.40
+ mobilenet_v2 min = 56.57 max = 57.15 avg = 56.88
+ mobilenet_v3 min = 365.04 max = 366.87 avg = 365.94
+ shufflenet min = 71.16 max = 72.02 avg = 71.68
+ shufflenet_v2 min = 52.14 max = 53.60 avg = 52.92
+ mnasnet min = 596.37 max = 603.62 avg = 600.50
+ proxylessnasnet min = 911.84 max = 912.23 avg = 912.04
+ efficientnet_b0 min = 611.77 max = 614.32 avg = 612.69
+ efficientnetv2_b0 min = 82.16 max = 83.05 avg = 82.62
+ regnety_400m min = 253.43 max = 255.79 avg = 254.66
+ blazeface min = 30.54 max = 30.91 avg = 30.70
+ googlenet min = 111.68 max = 112.65 avg = 112.11
+ googlenet_int8 min = 594.07 max = 597.09 avg = 596.03
+ resnet18 min = 78.14 max = 79.12 avg = 78.75
+ resnet18_int8 min = 412.69 max = 413.92 avg = 413.46
+ alexnet min = 40.93 max = 41.43 avg = 41.17
+ vgg16 min = 242.45 max = 244.46 avg = 243.47
+ vgg16_int8 min = 1545.61 max = 1548.72 avg = 1547.47
+ resnet50 min = 147.73 max = 148.56 avg = 148.07
+ resnet50_int8 min = 1034.47 max = 1042.31 avg = 1038.41
+ squeezenet_ssd min = 107.82 max = 110.53 avg = 108.98
+ squeezenet_ssd_int8 min = 423.30 max = 426.91 avg = 425.67
+ mobilenet_ssd min = 74.54 max = 77.13 avg = 75.97
+ mobilenet_ssd_int8 min = 510.95 max = 513.33 avg = 512.40
+ mobilenet_yolo min = 238.83 max = 239.64 avg = 239.27
+ mobilenetv2_yolov3 min = 159.80 max = 160.31 avg = 160.04
+ yolov4-tiny min = 233.89 max = 237.41 avg = 236.22
+ nanodet_m min = 122.39 max = 123.42 avg = 122.89
+ yolo-fastest-1.1 min = 194.49 max = 195.25 avg = 194.94
+ yolo-fastestv2 min = 193.06 max = 195.03 avg = 194.05
+ vision_transformer min = 547.36 max = 554.17 avg = 549.99
+ FastestDet min = 317.76 max = 321.38 avg = 320.18
+```
+
+### PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2)
+```
+loop_count = 4
+num_threads = 2
+powersave = 2
+gpu_device = -1
+cooling_down = 1
+ squeezenet min = 43.84 max = 43.95 avg = 43.88
+ squeezenet_int8 min = 35.48 max = 35.77 avg = 35.66
+ mobilenet min = 69.31 max = 70.03 avg = 69.66
+ mobilenet_int8 min = 42.30 max = 42.40 avg = 42.35
+ mobilenet_v2 min = 59.07 max = 59.35 avg = 59.19
+ mobilenet_v3 min = 46.02 max = 46.37 avg = 46.19
+ shufflenet min = 31.52 max = 31.61 avg = 31.56
+ shufflenet_v2 min = 23.99 max = 24.07 avg = 24.04
+ mnasnet min = 49.40 max = 50.45 avg = 49.92
+ proxylessnasnet min = 53.24 max = 53.85 avg = 53.53
+ efficientnet_b0 min = 77.49 max = 77.84 avg = 77.62
+ efficientnetv2_b0 min = 88.51 max = 88.92 avg = 88.69
+ regnety_400m min = 66.99 max = 67.05 avg = 67.03
+ blazeface min = 7.74 max = 8.14 avg = 7.98
+ googlenet min = 126.62 max = 127.23 avg = 126.91
+ googlenet_int8 min = 102.87 max = 103.16 avg = 103.01
+ resnet18 min = 102.28 max = 102.63 avg = 102.48
+ resnet18_int8 min = 72.01 max = 72.45 avg = 72.29
+ alexnet min = 76.00 max = 124.61 avg = 88.24
+ vgg16 min = 597.75 max = 601.99 avg = 599.44
+ vgg16_int8 min = 421.40 max = 423.83 avg = 423.01
+ resnet50 min = 278.16 max = 280.64 avg = 279.37
+ resnet50_int8 min = 207.26 max = 207.47 avg = 207.36
+ squeezenet_ssd min = 108.69 max = 109.26 avg = 108.99
+ squeezenet_ssd_int8 min = 84.05 max = 84.60 avg = 84.28
+ mobilenet_ssd min = 141.65 max = 142.46 avg = 142.14
+ mobilenet_ssd_int8 min = 84.43 max = 84.99 avg = 84.73
+ mobilenet_yolo min = 322.53 max = 325.15 avg = 323.51
+ mobilenetv2_yolov3 min = 194.84 max = 196.98 avg = 196.07
+ yolov4-tiny min = 208.29 max = 213.26 avg = 210.77
+ nanodet_m min = 64.78 max = 65.38 avg = 65.08
+ yolo-fastest-1.1 min = 37.89 max = 38.23 avg = 38.07
+ yolo-fastestv2 min = 29.75 max = 30.33 avg = 30.09
+ vision_transformer min = 4257.71 max = 4263.73 avg = 4260.60
+ FastestDet min = 30.86 max = 44.67 avg = 34.41
+```
diff --git a/benchmark/RankCards/README.md b/benchmark/RankCards/README.md
index 1db9ced86ae..00cb164e50c 100644
--- a/benchmark/RankCards/README.md
+++ b/benchmark/RankCards/README.md
@@ -5,79 +5,88 @@ The set is then compared to a reference set by calculating the ratio of each mod
Finally, the boards are ranked from fast to slow.
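As a rough sketch of that ratio computation (assuming a plain average of per-model ratios against the reference board; the actual aggregation may differ), where `board.txt` and `ref.txt` are hypothetical `model avg_ms` listings parsed from the benchncnn logs:

```bash
# Average each model's time ratio (board / reference); lower means faster.
paste board.txt ref.txt | awk '{ sum += $2 / $4; n++ } END { printf "%.3g\n", sum / n }'
```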
| | Board | Ratio |
| :--: | :---- | :--- |
-| 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.123 |
-| 2 | nVIDIA RTX2080 of Desktop | 0.126 |
-| 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.136 |
-| 4 | nVIDIA RTX2060 of Notebook | 0.167 |
-| 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.199 |
-| 6 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.204 |
-| 7 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.211 |
-| 8 | MacBook Pro (13-inch, M1, 2020) | 0.26 |
-| 9 | AWS c5.4xlarge Instance (Intel Xeon Platinum 8124M @ 3.399GHz, Ubuntu 20.04.6 LTS x86_64) | 0.317 |
-| 10 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.333 |
-| 11 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.37 |
-| 12 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.37 |
-| 13 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.377 |
-| 14 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.418 |
-| 15 | nVIDIA RTX A3000 of Notebook (6GB) | 0.434 |
-| 16 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.473 |
-| 17 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.492 |
-| 18 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.498 |
-| 19 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.583 |
-| 20 | NVIDIA Jetson Orin Nano | 0.641 |
-| 21 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 0.696 |
-| 22 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 0.809 |
-| 23 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 0.86 |
-| 24 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 0.898 |
-| 25 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 |
-| 26 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.01 |
-| 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.24 |
-| 28 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.36 |
-| 29 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 1.62 |
-| 30 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.75 |
-| 31 | Loongson 3A5000 (LA464 2.5GHz * 4) | 1.79 |
-| 32 | NVIDIA Jetson Nano | 1.82 |
-| 33 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 1.83 |
-| 34 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 1.83 |
-| 35 | Intel Celeron N5105 | 2.12 |
-| 36 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 2.49 |
-| 37 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 2.54 |
-| 38 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 2.71 |
-| 39 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 2.74 |
-| 40 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 2.77 |
-| 41 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 2.81 |
-| 42 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 2.9 |
-| 43 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 2.93 |
-| 44 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 2.95 |
-| 45 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.2 |
-| 46 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 3.88 |
-| 47 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 3.96 |
-| 48 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 3.99 |
-| 49 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 4 |
-| 50 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 4.13 |
-| 51 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 4.6 |
-| 52 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 4.92 |
-| 53 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 5.21 |
-| 54 | Intel Atom x5-Z8350 | 5.83 |
-| 55 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 5.93 |
-| 56 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 5.93 |
-| 57 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 6.53 |
-| 58 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 7.71 |
-| 59 | iPhone 5S (Apple A7 1.3GHz x 2) | 8.1 |
-| 60 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 10.3 |
-| 61 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 10.7 |
-| 62 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 12.6 |
-| 63 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 14.8 |
-| 64 | Loongson 2K1000 (GS264 1.0GHz x 2) | 19 |
-| 65 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 19 |
-| 66 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 21.1 |
-| 67 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 21.6 |
-| 68 | Sunway SW831 (sw_64 2.5GHz * 8) | 31.2 |
-| 69 | Intel Celeron M 420 (Yonah 1.60 GHz x 1) | 33.4 |
-| 70 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 34 |
-| 71 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 35.2 |
-| 72 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 46 |
-| 73 | VisionFive2 , JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 | 52 |
-| 74 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 60.5 |
-| 75 | Sunway SW421 (sw_64 1.7GHz * 4) | 87.7 |
-| 76 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 132 |
+| 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.147 |
+| 2 | nVIDIA RTX2080 of Desktop | 0.15 |
+| 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.18 |
+| 4 | nVIDIA RTX2060 of Notebook | 0.198 |
+| 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.255 |
+| 6 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.275 |
+| 7 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.277 |
+| 8 | MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12) | 0.309 |
+| 9 | MacBook Pro (13-inch, M1, 2020) | 0.346 |
+| 10 | AWS c5.4xlarge Instance | 0.418 |
+| 11 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.427 |
+| 12 | Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740) | 0.45 |
+| 13 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.478 |
+| 14 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.482 |
+| 15 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.485 |
+| 16 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.54 |
+| 17 | nVIDIA RTX A3000 of Notebook (6GB) | 0.577 |
+| 18 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.593 |
+| 19 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.642 |
+| 20 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.665 |
+| 21 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.753 |
+| 22 | NVIDIA Jetson Orin Nano | 0.819 |
+| 23 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 |
+| 24 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 1 |
+| 25 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 1.05 |
+| 26 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 1.11 |
+| 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 1.19 |
+| 28 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.35 |
+| 29 | NVIDIA Jetson TX2 NX(NV-Denver2 2.0Ghz x 2 + Cortex-A57 2.0Ghz x 4 + 256-core NVIDIA Pascal iGPU) | 1.59 |
+| 30 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.66 |
+| 31 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.75 |
+| 32 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 2.19 |
+| 33 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 2.23 |
+| 34 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 2.28 |
+| 35 | Loongson 3A5000 (LA464 2.5GHz * 4) | 2.31 |
+| 36 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 2.37 |
+| 37 | NVIDIA Jetson Nano | 2.44 |
+| 38 | Intel Celeron N5105 | 2.8 |
+| 39 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.24 |
+| 40 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 3.48 |
+| 41 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 3.58 |
+| 42 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 3.63 |
+| 43 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 3.75 |
+| 44 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 3.75 |
+| 45 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 3.82 |
+| 46 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 3.85 |
+| 47 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 3.86 |
+| 48 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 4.08 |
| 49 | Radxa Zero 3W, Cortex-A55 (ARMv8.2) (1.416 GHz x 4) | 4.5 |
+| 50 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 4.95 |
+| 51 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.11 |
+| 52 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.16 |
+| 53 | PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2) | 5.16 |
+| 54 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 5.26 |
+| 55 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 5.27 |
+| 56 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 5.88 |
+| 57 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 6.51 |
+| 58 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 6.66 |
+| 59 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 7.63 |
+| 60 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 7.66 |
+| 61 | Intel Atom x5-Z8350 | 7.74 |
+| 62 | Loongson 2K2000 (LA364 1.5GHz * 2 with lsx) | 8.23 |
+| 63 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 8.34 |
+| 64 | OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4) | 9.51 |
+| 65 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 9.87 |
+| 66 | iPhone 5S (Apple A7 1.3GHz x 2) | 11 |
| 67 | MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2) | 11.9 |
+| 68 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 12.5 |
+| 69 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 13.7 |
+| 70 | Xeon Phi 3120A (1.10 GHz 57-core 228-thread) | 15.1 |
+| 71 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 16.3 |
+| 72 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 18.8 |
+| 73 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 24.4 |
+| 74 | Loongson 2K1000 (GS264 1.0GHz x 2) | 24.8 |
+| 75 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 26.7 |
+| 76 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 26.8 |
+| 77 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 36.2 |
+| 78 | Sunway SW831 (sw_64 2.5GHz * 8) | 40.7 |
+| 79 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 43.2 |
+| 80 | Intel Celeron M 420 (Yonah 1.60 GHz x 1) | 43.9 |
+| 81 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 45.9 |
| 82 | VisionFive2, JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32 | 72.4 |
+| 83 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 83.3 |
+| 84 | Sunway SW421 (sw_64 1.7GHz * 4) | 116 |
+| 85 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 165 |
diff --git a/build-android.cmd b/build-android.cmd
index 0c4262a37d0..b621dae6c1a 100644
--- a/build-android.cmd
+++ b/build-android.cmd
@@ -2,40 +2,22 @@
@ECHO OFF
@SETLOCAL
@SET ANDROID_NDK=
-@SET VULKAN_SDK=
:: Set ninja.exe
:: @SET NINJA_EXE=
:: android armv7
-mkdir build-android-armv7
-pushd build-android-armv7
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
-:: cmake -G Ninja -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM=%NINJA_EXE% -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-21 ..
-cmake --build . --parallel %NUMBER_OF_PROCESSORS%
-cmake --build . --target install
-popd
-
-:: android armv7 vulkan
mkdir build-android-armv7-vulkan
pushd build-android-armv7-vulkan
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
:: android aarch64
-mkdir build-android-aarch64
-pushd build-android-aarch64
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 ..
-cmake --build . --parallel %NUMBER_OF_PROCESSORS%
-cmake --build . --target install
-popd
-
-:: android aarch64 vulkan
mkdir build-android-aarch64-vulkan
pushd build-android-aarch64-vulkan
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
@@ -43,7 +25,7 @@ popd
:: android x86
mkdir build-android-x86
pushd build-android-x86
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
@@ -51,7 +33,7 @@ popd
:: android x86_64
mkdir build-android-x86_64
pushd build-android-x86_64
-cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
+cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd
diff --git a/build.sh b/build.sh
index 20a96eae2d3..754aaf8a4cd 100755
--- a/build.sh
+++ b/build.sh
@@ -1,9 +1,17 @@
#!/usr/bin/env bash
+##### android armv7 without neon
+mkdir -p build-android-armv7-without-neon
+pushd build-android-armv7-without-neon
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
+make -j4
+make install
+popd
+
##### android armv7
mkdir -p build-android-armv7
pushd build-android-armv7
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -11,15 +19,7 @@ popd
##### android aarch64
mkdir -p build-android-aarch64
pushd build-android-aarch64
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 ..
-make -j4
-make install
-popd
-
-##### android armv7 without neon
-mkdir -p build-android-armv7-without-neon
-pushd build-android-armv7-without-neon
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -27,7 +27,7 @@ popd
##### android x86
mkdir -p build-android-x86
pushd build-android-x86
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -35,39 +35,7 @@ popd
##### android x86_64
mkdir -p build-android-x86_64
pushd build-android-x86_64
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 ..
-make -j4
-make install
-popd
-
-##### android armv7 vulkan
-mkdir -p build-android-armv7-vulkan
-pushd build-android-armv7-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android aarch64 vulkan
-mkdir -p build-android-aarch64-vulkan
-pushd build-android-aarch64-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android x86 vulkan
-mkdir -p build-android-x86-vulkan
-pushd build-android-x86-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### android x86_64 vulkan
-mkdir -p build-android-x86_64-vulkan
-pushd build-android-x86_64-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd
@@ -144,70 +112,6 @@ make -j4
make install
popd
-##### ios armv7 arm64
-mkdir -p build-ios
-pushd build-ios
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
-make -j4
-make install
-popd
-
-##### ios armv7 arm64 bitcode
-mkdir -p build-ios-bitcode
-pushd build-ios-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc.toolchain.cmake -DENABLE_BITCODE=ON ..
-make -j4
-make install
-popd
-
-##### ios simulator i386 x86_64
-mkdir -p build-ios-sim
-pushd build-ios-sim
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=OFF ..
-make -j4
-make install
-popd
-
-##### ios simulator i386 x86_64 bitcode
-mkdir -p build-ios-sim-bitcode
-pushd build-ios-sim-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc.toolchain.cmake -DENABLE_BITCODE=ON ..
-make -j4
-make install
-popd
-
-##### ios arm64 vulkan
-mkdir -p build-ios-vulkan
-pushd build-ios-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### ios arm64 vulkan bitcode
-mkdir -p build-ios-vulkan-bitcode
-pushd build-ios-vulkan-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iosxc-arm64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
-##### ios simulator x86_64 vulkan
-mkdir -p build-ios-sim-vulkan
-pushd build-ios-sim-vulkan
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=OFF -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make
-make install
-popd
-
-##### ios simulator x86_64 vulkan bitcode
-mkdir -p build-ios-sim-vulkan-bitcode
-pushd build-ios-sim-vulkan-bitcode
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/iossimxc-x64.toolchain.cmake -DENABLE_BITCODE=ON -DVulkan_INCLUDE_DIR=${VULKAN_SDK}/MoltenVK/include -DVulkan_LIBRARY=${VULKAN_SDK}/MoltenVK/iOS/MoltenVK.framework/MoltenVK -DNCNN_VULKAN=ON ..
-make -j4
-make install
-popd
-
##### MacOS
mkdir -p build-mac
pushd build-mac
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index e6c74fec5eb..4eeedb010c7 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -31,35 +31,14 @@ macro(ncnn_add_arch_opt_layer class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CF
list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
# generate layer_declaration and layer_registry file
- set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
- set(layer_declaration_class "class ${class}_final_${NCNN_TARGET_ARCH_OPT} : virtual public ${class}")
- set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n")
-
- if(WITH_LAYER_${name}_vulkan)
- set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan")
- set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
- endif()
-
set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}")
- set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
-
- set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n")
- set(layer_declaration "${layer_declaration}public:\n")
- set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration}};\n")
- set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final_${NCNN_TARGET_ARCH_OPT})\n} // namespace ncnn\n\n")
-
- set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}) }\n")
+
+ set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")
else()
# no isa optimized version
if(WITH_LAYER_${name})
- set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n")
+ set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
else()
set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
@@ -110,18 +89,21 @@ macro(ncnn_add_layer class)
# generate layer_declaration and layer_registry file
if(WITH_LAYER_${name})
set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
- set(layer_declaration_class "class ${class}_final : virtual public ${class}")
- set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}) }\n")
source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")
endif()
+ if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
+ set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n")
+
+ source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp")
+ endif()
+
if(WITH_LAYER_${name}_vulkan)
set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan")
- set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
+ set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n")
file(GLOB_RECURSE NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp")
file(GLOB_RECURSE NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp")
@@ -133,28 +115,22 @@ macro(ncnn_add_layer class)
source_group ("sources\\\\layers\\\\vulkan" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp")
endif()
- if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
- set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n")
- set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}")
- set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}::create_pipeline(opt); if (ret) return ret; }\n")
- set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}")
-
- source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp")
+ if(WITH_LAYER_${name})
+ set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
+ else()
+ set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
- if(WITH_LAYER_${name})
- set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n")
- set(layer_declaration "${layer_declaration}public:\n")
- set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n")
- set(layer_declaration "${layer_declaration}};\n")
- set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final)\n} // namespace ncnn\n\n")
+ if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
+ set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#endif\n")
+ else()
+ set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
- if(WITH_LAYER_${name})
- set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n")
+ if(WITH_LAYER_${name}_vulkan)
+ set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n")
else()
- set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
+ set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
endif()
if(NCNN_TARGET_ARCH STREQUAL "x86")
diff --git a/cmake/ncnn_add_shader.cmake b/cmake/ncnn_add_shader.cmake
index 8006241bc05..76680f4ca81 100644
--- a/cmake/ncnn_add_shader.cmake
+++ b/cmake/ncnn_add_shader.cmake
@@ -1,7 +1,7 @@
macro(ncnn_add_shader NCNN_SHADER_SRC)
get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE)
- set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h)
+ set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h)
add_custom_command(
OUTPUT ${NCNN_SHADER_COMP_HEADER}
@@ -13,7 +13,7 @@ macro(ncnn_add_shader NCNN_SHADER_SRC)
set_source_files_properties(${NCNN_SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE)
get_filename_component(NCNN_SHADER_COMP_HEADER_NAME ${NCNN_SHADER_COMP_HEADER} NAME)
- string(APPEND layer_shader_spv_data "#include \"${NCNN_SHADER_COMP_HEADER_NAME}\"\n")
+ string(APPEND layer_shader_spv_data "#include \"layer/vulkan/shader/${NCNN_SHADER_COMP_HEADER_NAME}\"\n")
get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE)
string(APPEND layer_shader_registry "{${NCNN_SHADER_SRC_NAME_WE}_comp_data,sizeof(${NCNN_SHADER_SRC_NAME_WE}_comp_data)},\n")
diff --git a/cmake/ncnn_generate_shader_comp_header.cmake b/cmake/ncnn_generate_shader_comp_header.cmake
index a41b6328d8d..79f7c1eff3b 100644
--- a/cmake/ncnn_generate_shader_comp_header.cmake
+++ b/cmake/ncnn_generate_shader_comp_header.cmake
@@ -18,8 +18,8 @@ string(REGEX REPLACE "\n\n" "\n" comp_data "${comp_data}")
get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
# text to hex
-file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}")
-file(READ ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX)
+file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}")
+file(READ ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX)
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex})
string(FIND "${comp_data_hex}" "," tail_comma REVERSE)
string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex)
diff --git a/cmake/ncnn_generate_shader_spv_header.cmake b/cmake/ncnn_generate_shader_spv_header.cmake
deleted file mode 100644
index 93649daed92..00000000000
--- a/cmake/ncnn_generate_shader_spv_header.cmake
+++ /dev/null
@@ -1,581 +0,0 @@
-
-function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS SHADER_SRC)
-
- # fp32
- get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
-
- set(SHADER_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -V -s -x -o ${SHADER_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 packed
- set(SHADER_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16p")
-
- set(SHADER_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16p_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"
- "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_packed=1
- -V -s -x -o ${SHADER_fp16p_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16p_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 packed + fp16 arithmetic
- set(SHADER_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16pa")
-
- set(SHADER_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16pa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
- "-D buffer_ld1(buf,i)=float16_t(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"
- "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_fp16pa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16pa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 storage
- set(SHADER_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16s")
-
- set(SHADER_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16s_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
- "-D buffer_ld1(buf,i)=float(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"
- "-D buffer_ld2(buf,i)=vec2(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(buf[i])"
- "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"
- "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_storage=1
- -V -s -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # fp16 storage + fp16 arithmetic
- set(SHADER_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16sa")
-
- set(SHADER_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_fp16sa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_fp16sa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_fp16sa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp32
- set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image")
-
- set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba32f
- -Dunfp=highp
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
-
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1
- -V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16p
- set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p")
-
- set(SHADER_image_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}"
- "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_packed=1
- -V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16p + fp16a
- set(SHADER_image_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16pa")
-
- set(SHADER_image_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16pa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
-
- -Dimfmtc1=r32f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=float16_t(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}"
- "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))"
- "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))"
- "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))"
- "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_image_fp16pa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16pa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16s
- set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s")
-
- set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4
- -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4
-
- -Dimfmtc1=r16f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r"
- "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)"
- "-D image1d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,v);}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=float(buf[i])"
- "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}"
- "-D buffer_ld2(buf,i)=vec2(buf[i])"
- "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=vec4(buf[i])"
- "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}"
- "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))"
- "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}"
-
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_storage=1
- -V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- # image + fp16s + fp16a
- set(SHADER_image_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16sa")
-
- set(SHADER_image_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h)
- add_custom_command(
- OUTPUT ${SHADER_image_fp16sa_SPV_HEX_FILE}
- COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
- ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4
- -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4
-
- -Dimfmtc1=r16f -Dimfmtc4=rgba16f
- -Dunfp=mediump
-
- "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)"
- "-D image1d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image2d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image3d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}"
- "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))"
- "-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}"
- "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
- "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}"
-
- "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))"
- "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))"
- "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))"
- "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}"
- "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}"
- "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}"
- "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}"
- "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}"
- "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}"
-
- "-D buffer_ld1(buf,i)=buf[i]"
- "-D buffer_st1(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}"
- "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}"
- "-D buffer_ld2(buf,i)=buf[i]"
- "-D buffer_st2(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_ld4(buf,i)=buf[i]"
- "-D buffer_st4(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}"
- "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}"
- "-D buffer_ld8(buf,i)=buf[i]"
- "-D buffer_st8(buf,i,v)={buf[i]=v;}"
- "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}"
- "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}"
- "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}"
- "-D sfp2afpmat4(v)=v"
- "-D afp2sfpmat4(v)=v"
-
- "-D psc(x)=(x==0?p.x:x)"
- -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1
- -V -s -x -o ${SHADER_image_fp16sa_SPV_HEX_FILE} ${SHADER_SRC}
- DEPENDS ${SHADER_SRC}
- COMMENT "Building SPIR-V module ${SHADER_image_fp16sa_SRC_NAME_WE}.spv"
- VERBATIM
- )
- set_source_files_properties(${SHADER_image_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
-
- set(LOCAL_SHADER_SPV_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h)
-
- file(WRITE ${LOCAL_SHADER_SPV_HEADER}
- "static const uint32_t ${SHADER_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- "static const uint32_t ${SHADER_image_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n"
- )
-
- set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE)
-
- set(LOCAL_SHADER_SPV_HEX_HEADERS
- ${SHADER_SPV_HEX_FILE}
- ${SHADER_fp16p_SPV_HEX_FILE}
- ${SHADER_fp16pa_SPV_HEX_FILE}
- ${SHADER_fp16s_SPV_HEX_FILE}
- ${SHADER_fp16sa_SPV_HEX_FILE}
- ${SHADER_image_SPV_HEX_FILE}
- ${SHADER_image_fp16p_SPV_HEX_FILE}
- ${SHADER_image_fp16pa_SPV_HEX_FILE}
- ${SHADER_image_fp16s_SPV_HEX_FILE}
- ${SHADER_image_fp16sa_SPV_HEX_FILE}
- )
-
- set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE)
- set(${SHADER_SPV_HEX_HEADERS} ${LOCAL_SHADER_SPV_HEX_HEADERS} PARENT_SCOPE)
-
-endfunction()
diff --git a/codeformat.sh b/codeformat.sh
index 3e9cb33832a..21d128f4698 100755
--- a/codeformat.sh
+++ b/codeformat.sh
@@ -3,9 +3,9 @@
# we run clang-format and astyle twice to get stable format output
format_code() {
- find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | xargs -i clang-format -i {}
+ find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | grep -v ruapu | xargs -i clang-format -i {}
astyle -n -r "benchmark/*.h,*.cpp,*.cc" "tests/*.h,*.cpp,*.cc" "tools/*.h,*.cpp,*.cc" "examples/*.h,*.cpp,*.cc"
- astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h
+ astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h --exclude=src/ruapu.h
astyle -n -r "python/*.h,*.cpp,*.cc" --exclude=python/pybind11
}
diff --git a/docs/Home.md b/docs/Home.md
index f1108b7b8ef..7f377e1b1f7 100644
--- a/docs/Home.md
+++ b/docs/Home.md
@@ -21,8 +21,6 @@ int main()
net.load_model("model.bin");
ncnn::Extractor ex = net.create_extractor();
- ex.set_light_mode(true);
- ex.set_num_threads(4);
ex.input("data", in);
diff --git a/docs/developer-guide/arm-a53-a55-dual-issue.md b/docs/developer-guide/arm-a53-a55-dual-issue.md
index 7344747a8c0..ace5e7092a4 100644
--- a/docs/developer-guide/arm-a53-a55-dual-issue.md
+++ b/docs/developer-guide/arm-a53-a55-dual-issue.md
@@ -51,20 +51,23 @@ fmla
```
## A55
-* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
-* 64bit vector load can be dual issued with fmla, no penalty
+* Limited by the number of neon register read and write ports, most neon instructions cannot be dual-issued.
+* neon instructions have different latencies
+* 128bit vector load cannot be dual-issued with fmla; a WAR hazard waits 2 cycles
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector insert can be dual issued with fmla, no penalty
### practical guide
-* use 64bit vector load only
-* load 64bit, dual issue with fmla
+* A55 can load 128 bits and write 256 bits per clock; it supports dual issue of two 64bit vector loads or single issue of one 128bit vector load
+* `ldr`, dual issue with fmla
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with fmla
* interleaved load loose register dependency
* nop trick is not needed
+* Unrolling fmla loops reduces pipeline bubbles
+* Some neon data type conversion instructions can be dual-issued, such as `fsvts`
```
ldr d0, [r0] // 0 cycle, v0 first 64bit
fmla
diff --git a/docs/developer-guide/glsl-extension.md b/docs/developer-guide/glsl-extension.md
index 185ca0e49cb..82ae035e46d 100644
--- a/docs/developer-guide/glsl-extension.md
+++ b/docs/developer-guide/glsl-extension.md
@@ -99,7 +99,7 @@ void main()
)";
Option opt;
- // you can control the extention behavior
+ // you can control the extension behavior
// even if the gpu supports 16bit storage
opt.use_fp16_storage = false;
@@ -170,10 +170,10 @@ declare variable in shared local memory
shared lfp tmp_a[8][4][2];
```
-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|
## image format and precision hint type
diff --git a/docs/developer-guide/glsl-extension.zh.md b/docs/developer-guide/glsl-extension.zh.md
index 9b0718adec5..1e856929ac3 100644
--- a/docs/developer-guide/glsl-extension.zh.md
+++ b/docs/developer-guide/glsl-extension.zh.md
@@ -170,10 +170,10 @@ void main()
shared lfp tmp_a[8][4][2];
```
-|local type|fp32|fp16p / fp16s|fp16s + fp16a|
-|---|---|---|---|
-|lfp|float|float|float16_t|
-|lfpvec4|vec4|uvec2|f16vec4|
+|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|
+|---|---|---|---|---|
+|lfp|float|float|float|float16_t|
+|lfpvec4|vec4|uvec2|uint64_t|f16vec4|
## 图像格式类型(image format type)和精度类型(precision hint type)
diff --git a/docs/developer-guide/layer-feat-mask.md b/docs/developer-guide/layer-feat-mask.md
new file mode 100644
index 00000000000..caff65c2144
--- /dev/null
+++ b/docs/developer-guide/layer-feat-mask.md
@@ -0,0 +1,111 @@
+# layer feature mask
+
+Each ncnn layer accepts a special parameter pair `31=X` to control layer-specific behavior.
+
+X is an unsigned integer in which each bit contributes a feature mask.
+
+We usually use it to configure fine-grained behavior for certain layers to maintain accuracy, reduce memory usage or optimize performance.
+
+|bit|value|mask|rationale|
+|---|---|---|---|
+|1<<0|1|no fp16 arithmetic|precision concern|
+|1<<1|2|no fp16 storage|precision concern|
+|1<<2|4|no bf16 storage|precision concern|
+|1<<3|8|no int8|debug dynamic quantized model|
+|1<<4|16|no vulkan|avoid cpu-gpu transfer overhead when a gpu op sits between cpu ops|
+|1<<5|32|no sgemm|reduce some memory|
+|1<<6|64|no winograd|reduce some memory|
+|1<<7|128|no threading|force single thread|
+
+These bits can be OR-combined into one value to control multiple behaviors simultaneously.
+
+For example, `31=17` means disabling both vulkan and fp16 arithmetic.
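+
+As a sanity check, the value for `31=X` can be composed with ordinary bit operations. A minimal illustrative sketch in plain C++ (not part of the ncnn API):
+
+```cpp
+#include <cstdio>
+
+int main()
+{
+    // bit positions from the table above
+    const unsigned int no_fp16_arithmetic = 1u << 0; // 1
+    const unsigned int no_vulkan = 1u << 4;          // 16
+
+    // OR the masks together to get the value for the `31=X` parameter pair
+    const unsigned int x = no_fp16_arithmetic | no_vulkan;
+    printf("31=%u\n", x); // prints 31=17
+    return 0;
+}
+```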
+
+## disable fp16 for certain layer to fix overflow
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
+```
+
+Typically, we use fp16 computation to improve inference speed.
+However, since the weight values of `conv1` are very large, fp16 accumulation may cause numerical overflow, so fp16 needs to be disabled individually for `conv1` while other layers continue to run in fp16 mode.
+
+Add `31=3` to disable fp16 storage and arithmetic.
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=3
+```
+
+## disable vulkan for certain layer to improve performance
+
+```ruby
+7767517
+5 5
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+SomeCPULayer c0 1 1 conv0 c0 0=32
+ReLU relu0 1 1 c0 relu0
+SomeCPULayer c1 1 1 relu0 c1 0=32
+```
+
+Between the CPU layers, there is a simple calculation layer that supports vulkan. We can set `31=16` to force it to run on the CPU. This avoids the overhead of data upload, download and storage layout conversion between CPU and GPU. After all, the CPU is fast enough for simple operations.
+
+```ruby
+7767517
+5 5
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+SomeCPULayer c0 1 1 conv0 c0 0=32
+ReLU relu0 1 1 c0 relu0 31=16
+SomeCPULayer c1 1 1 relu0 c1 0=32
+```
+
+## disable winograd for certain layer to reduce memory usage
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1
+```
+
+Winograd convolution trades extra memory for better performance, but the speedup is not always realized. In memory-constrained situations, or when memory IO is the bottleneck, we can disable winograd on some layers in exchange for a smaller memory footprint. Add `31=64` to a Convolution layer to force the implicit-gemm or tiled im2col-gemm implementation, reducing memory usage and sometimes improving vulkan performance.
+
+```ruby
+7767517
+3 3
+Input input 0 1 input0 0=22 1=22 2=32
+Convolution conv0 1 1 input0 conv0 0=32 1=1 6=1024 9=1
+Convolution conv1 1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=64
+```
+
+## disable threading for certain layer to improve performance
+
+```ruby
+7767517
+4 4
+Input input 0 1 input0 0=22 1=22 2=3
+Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
+HardSigmoid hs 1 1 conv0 hs0
+Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
+```
+
+The overhead of multi-thread dispatch and merging is too large for small tensors. Add `31=128` to the HardSigmoid layer to force single-threaded execution, reducing power consumption and improving performance.
+
+```ruby
+7767517
+4 4
+Input input 0 1 input0 0=22 1=22 2=3
+Convolution conv0 1 1 input0 conv0 0=16 1=3 6=432
+HardSigmoid hs 1 1 conv0 hs0 31=128
+Convolution conv1 1 1 hs0 conv1 0=16 1=3 6=2304
+```
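+
+Conversely, a mask value can be decoded back into the individual features it disables. A minimal illustrative sketch, assuming only the bit assignments from the table above:
+
+```cpp
+#include <cstdio>
+
+int main()
+{
+    // feature names indexed by bit position, as listed in the table above
+    const char* names[8] = {
+        "no fp16 arithmetic", "no fp16 storage", "no bf16 storage", "no int8",
+        "no vulkan", "no sgemm", "no winograd", "no threading"
+    };
+
+    const unsigned int x = 17; // e.g. parsed from `31=17`
+    for (int i = 0; i < 8; i++)
+    {
+        if (x & (1u << i))
+            printf("%s\n", names[i]); // prints "no fp16 arithmetic" and "no vulkan"
+    }
+    return 0;
+}
+```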
diff --git a/docs/faq.en.md b/docs/faq.en.md
index 071809808fe..807c4a9e3ee 100644
--- a/docs/faq.en.md
+++ b/docs/faq.en.md
@@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice),
Set net.opt.use_vulkan_compute = true before load_param / load_model;
-- ## How to exexcite multiple blob inputs, multiple blob outputs?
+- ## How to execute multiple blob inputs, multiple blob outputs?
Multiple execute `ex.input()` and `ex.extract()` like following
```
ex.input("data1", in_1);
diff --git a/docs/how-to-build/build-mlir2ncnn.md b/docs/how-to-build/build-mlir2ncnn.md
index f975824c7cc..521f20cff82 100644
--- a/docs/how-to-build/build-mlir2ncnn.md
+++ b/docs/how-to-build/build-mlir2ncnn.md
@@ -8,7 +8,7 @@ https://github.com/llvm/llvm-project.git
git checkout -b mlir
```
Current working commit id is 74e6030bcbcc8e628f9a99a424342a0c656456f9:
-```
+```bash
$ git log
commit 74e6030bcbcc8e628f9a99a424342a0c656456f9 (HEAD -> main, origin/main, origin/HEAD)
@@ -49,6 +49,6 @@ See https://zhuanlan.zhihu.com/p/152535430
**Usage mlir2ncnn**
-```
+```bash
./mlir2ncnn pix2pix.mlir pix2pix.param pix2pix.bin
```
diff --git a/docs/how-to-build/how-to-build.md b/docs/how-to-build/how-to-build.md
index e7cbf472726..c15e1d9485b 100644
--- a/docs/how-to-build/how-to-build.md
+++ b/docs/how-to-build/how-to-build.md
@@ -180,7 +180,7 @@ cmake --build . --config Release --target install
```
(optional) Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home
-Build ncnn library (replace with a proper path):
+Build ncnn library (replace `` with a proper path):
```shell
cd
@@ -193,6 +193,43 @@ cmake --build . --config Release --target install
Note: To speed up compilation process on multi core machines, configuring `cmake` to use `jom` or `ninja` using `-G` flag is recommended.
+Note: For protobuf >= 22.0 (take v25.3 as an example):
+
+Build zlib:
+```shell
+git clone -b v1.3.1 https://github.com/madler/zlib.git
+cd zlib
+mkdir build
+cd build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install ..
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
+Build protobuf library (replace `` with a proper path):
+```shell
+git clone -b v25.3 https://github.com/protocolbuffers/protobuf.git
+cd protobuf
+git submodule update --init --recursive
+
+mkdir protobuf_build
+cd protobuf_build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_CXX_STANDARD=14 -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DZLIB_INCLUDE_DIR=\build\install\include -DZLIB_LIBRARY=\build\install\lib\zlib.lib -DABSL_PROPAGATE_CXX_STD=ON ../cmake
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
+Build ncnn library (replace `` and `` with proper paths):
+
+```shell
+cd
+mkdir -p build
+cd build
+cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_PREFIX_PATH=/protobuf_build\install\cmake -DZLIB_INCLUDE_DIR=\build\install\include -DZLIB_LIBRARY=\build\install\lib\zlib.lib -Dabsl_DIR=/protobuf_build\install\lib\cmake\absl -Dutf8_range_DIR=/protobuf_build\install\lib\cmake\utf8_range -DNCNN_VULKAN=ON ..
+cmake --build . --config Release -j 2
+cmake --build . --config Release --target install
+```
+
***
### Build for macOS
@@ -215,13 +252,13 @@ Download and install Vulkan SDK from
```shell
-wget https://sdk.lunarg.com/sdk/download/1.2.189.0/mac/vulkansdk-macos-1.2.189.0.dmg?Human=true -O vulkansdk-macos-1.2.189.0.dmg
-hdiutil attach vulkansdk-macos-1.2.189.0.dmg
-sudo /Volumes/vulkansdk-macos-1.2.189.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.2.189.0 --accept-licenses --default-answer --confirm-command install
-hdiutil detach /Volumes/vulkansdk-macos-1.2.189.0
+wget https://sdk.lunarg.com/sdk/download/1.3.280.1/mac/vulkansdk-macos-1.3.280.1.dmg -O vulkansdk-macos-1.3.280.1.dmg
+hdiutil attach vulkansdk-macos-1.3.280.1.dmg
+sudo /Volumes/vulkansdk-macos-1.3.280.1/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.3.280.1 --accept-licenses --default-answer --confirm-command install
+hdiutil detach /Volumes/vulkansdk-macos-1.3.280.1
# setup env
-export VULKAN_SDK=`pwd`/vulkansdk-macos-1.2.189.0/macOS
+export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.280.1/macOS
```
```shell
@@ -229,9 +266,8 @@ cd
mkdir -p build
cd build
-cmake -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" \
- -DVulkan_INCLUDE_DIR=`pwd`/../vulkansdk-macos-1.2.189.0/MoltenVK/include \
- -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.2.189.0/MoltenVK/dylib/macOS/libMoltenVK.dylib \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64;arm64" \
+ -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.3.280.1/macOS/lib/libMoltenVK.dylib \
-DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..
cmake --build . -j 4
@@ -330,12 +366,7 @@ cd build-android-armv7
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
-DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
- -DANDROID_PLATFORM=android-14 ..
-
-# If you want to enable Vulkan, platform api version >= android-24 is needed
-cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
- -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
- -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+ -DANDROID_PLATFORM=android-14 -DNCNN_VULKAN=ON ..
# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags
@@ -356,12 +387,7 @@ cd build-android-aarch64
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"\
-DANDROID_ABI="arm64-v8a" \
- -DANDROID_PLATFORM=android-21 ..
-
-# If you want to enable Vulkan, platform api version >= android-24 is needed
-cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
- -DANDROID_ABI="arm64-v8a" \
- -DANDROID_PLATFORM=android-24 -DNCNN_VULKAN=ON ..
+ -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags
@@ -395,7 +421,7 @@ mkdir -p build-ios
cd build-ios
cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
- -DIOS_PLATFORM=OS -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="armv7;arm64;arm64e" \
+ -DPLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="arm64;arm64e" \
-DPERL_EXECUTABLE=/usr/local/bin/perl \
-DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
@@ -422,7 +448,7 @@ mkdir -p build-ios-sim
cd build-ios-sim
cmake -DCMAKE_TOOLCHAIN_FILE=/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
- -DIOS_PLATFORM=SIMULATOR -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DIOS_ARCH="i386;x86_64" \
+ -DPLATFORM=SIMULATORARM64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="x86_64;arm64" \
-DPERL_EXECUTABLE=/usr/local/bin/perl \
-DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..
@@ -469,21 +495,11 @@ git submodule update --init
mkdir -p build-ios
cd build-ios
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS -DIOS_ARCH="armv7;arm64;arm64e" \
- -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
- -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
- -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
- -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \
- -DNCNN_BUILD_BENCHMARK=OFF ..
-
-# vulkan is only available on arm64 devices
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=OS64 -DIOS_ARCH="arm64;arm64e" \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=OS64 -DARCHS="arm64;arm64e" \
-DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
-DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
-DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
-DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \
- -DVulkan_INCLUDE_DIR=$VULKAN_SDK/../MoltenVK/include \
- -DVulkan_LIBRARY=$VULKAN_SDK/../MoltenVK/dylib/iOS/libMoltenVK.dylib \
-DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
@@ -497,7 +513,7 @@ cd
mkdir -p build-ios-sim
cd build-ios-sim
-cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DIOS_PLATFORM=SIMULATOR -DIOS_ARCH="i386;x86_64" \
+cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=SIMULATORARM64 -DARCHS="x86_64;arm64" \
-DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
-DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
-DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
@@ -508,7 +524,7 @@ cmake --build . -j 4
cmake --build . --target install
```
-Package glslang framework:
+Package glslang framework for iPhoneOS:
```shell
cd
@@ -519,13 +535,12 @@ ln -s Versions/Current/Headers glslang.framework/Headers
ln -s Versions/Current/Resources glslang.framework/Resources
ln -s Versions/Current/glslang glslang.framework/glslang
libtool -static build-ios/install/lib/libglslang.a build-ios/install/lib/libMachineIndependent.a build-ios/install/lib/libGenericCodeGen.a build-ios/install/lib/libSPIRV.a build-ios/install/lib/libOGLCompiler.a build-ios/install/lib/libOSDependent.a -o build-ios/install/lib/libglslang_combined.a
-libtool -static build-ios-sim/install/lib/libglslang.a build-ios-sim/install/lib/libMachineIndependent.a build-ios-sim/install/lib/libGenericCodeGen.a build-ios-sim/install/lib/libSPIRV.a build-ios-sim/install/lib/libOGLCompiler.a build-ios-sim/install/lib/libOSDependent.a -o build-ios-sim/install/lib/libglslang_combined.a
-lipo -create build-ios/install/lib/libglslang_combined.a build-ios-sim/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
+lipo -create build-ios/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
cp -r build/install/include/glslang glslang.framework/Versions/A/Headers/
sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
```
-Package ncnn framework:
+Package ncnn framework for iPhoneOS:
```shell
cd
@@ -535,7 +550,7 @@ ln -s A ncnn.framework/Versions/Current
ln -s Versions/Current/Headers ncnn.framework/Headers
ln -s Versions/Current/Resources ncnn.framework/Resources
ln -s Versions/Current/ncnn ncnn.framework/ncnn
-lipo -create build-ios/install/lib/libncnn.a build-ios-sim/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
+lipo -create build-ios/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/
sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
```
@@ -753,7 +768,7 @@ Pick `build-qnx/install` folder for further usage.
Install DevkitPRO toolchains
- If you are working on windows, download DevkitPro installer from [DevkitPro](https://devkitpro.org/wiki/Getting_Started).
- If you are using Ubuntu, the official guidelines from DevkitPro might not work for you. Try using the lines below to install
-```
+```shell
sudo apt-get update
sudo apt-get upgrade
wget https://apt.devkitpro.org/install-devkitpro-pacman
@@ -761,14 +776,14 @@ chmod +x ./install-devkitpro-pacman
sudo ./install-devkitpro-pacman
```
-```
+```shell
export DEVKITPRO=/opt/devkitpro
export DEVKITARM=/opt/devkitpro/devkitARM
export DEVKITPPC=/opt/devkitpro/devkitPPC
export PATH=/opt/devkitpro/tools/bin:$PATH
source ~/.profile
```
-```
+```shell
sudo dkp-pacman -Sy
sudo dkp-pacman -Syu
sudo dkp-pacman -S 3ds-dev
@@ -796,7 +811,7 @@ Copy the toolchain files from [3DS-cmake](https://github.com/Xtansia/3ds-cmake)(
```
Build with:
-```
+```shell
cd ncnn
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/DevkitArm3DS.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_VFPV4=OFF ..
diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
index 0c17a306738..60d9f2c639a 100644
--- a/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
+++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
@@ -39,7 +39,7 @@ Most of these systems are android with version lower than 8.1.
In the beginning, I had no GPGPU programming experience, and I had to learn one.
-vulkan is considered more portable and well supported by venders and the cross-platform low-overhead graphics api. As a contrast, cuda is only available on nvidia device, metal is only available on macos and ios, while loading opencl library is banned in android 7.0+ and does not work on ios.
+vulkan is considered more portable and well supported by vendors as a cross-platform low-overhead graphics api. By contrast, cuda is only available on nvidia devices, metal is only available on macos and ios, while loading the opencl library is banned on android 7.0+ and does not work on ios.
### I got errors like "vkCreateComputePipelines failed -1000012000" or random stalls or crashes
@@ -87,7 +87,7 @@ It is common that your model runs slower on gpu than cpu on arm devices like mob
### vulkan device not found / extra high cpu utility while vulkan is enabled on nvidia gpu
-There are severel reasons could lead to this outcome. First please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this:
+There are several reasons that could lead to this outcome. First, please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this:
```bash
$ nvidia-smi
diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
index c23b050ba27..29b2a0fc586 100644
--- a/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
+++ b/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
@@ -103,7 +103,6 @@ Execute the network inference and retrieve the result
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
-ex.set_light_mode(true);
ex.input("data", in);
ex.extract("prob", out);
```
@@ -114,7 +113,6 @@ If you load model with binary param.bin file, you should use the enum value in a
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
-ex.set_light_mode(true);
ex.input(alexnet_param_id::BLOB_data, in);
ex.extract(alexnet_param_id::BLOB_prob, out);
```
@@ -131,10 +129,6 @@ for (int j=0; j<out.w; j++)
diff --git a/src/cpu.cpp b/src/cpu.cpp
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
#include <sys/system_properties.h> // __system_property_get()
#include
#endif
+#include
#include
+#include
+#include
#include
#include
#endif
@@ -69,6 +72,7 @@
#include
#include
#include
+#include
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
@@ -115,6 +119,9 @@
#include
#endif
+#define RUAPU_IMPLEMENTATION
+#include "ruapu.h"
+
// topology info
static int g_cpucount;
static int g_physical_cpucount;
@@ -125,9 +132,6 @@ static ncnn::CpuSet g_cpu_affinity_mask_big;
// isa info
#if defined _WIN32
-#if __arm__
-static int g_cpu_support_arm_neon;
-static int g_cpu_support_arm_vfpv4;
#if __aarch64__
static int g_cpu_support_arm_asimdhp;
static int g_cpu_support_arm_cpuid;
@@ -140,10 +144,11 @@ static int g_cpu_support_arm_sve2;
static int g_cpu_support_arm_svebf16;
static int g_cpu_support_arm_svei8mm;
static int g_cpu_support_arm_svef32mm;
-#else // __aarch64__
+#elif __arm__
static int g_cpu_support_arm_edsp;
-#endif // __aarch64__
-#endif // __arm__
+static int g_cpu_support_arm_neon;
+static int g_cpu_support_arm_vfpv4;
+#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
@@ -183,171 +188,59 @@ static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
-#if defined _WIN32
-static int g_sigill_caught = 0;
-static jmp_buf g_jmpbuf;
-
-static LONG CALLBACK catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)
+static bool is_being_debugged()
{
- if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION)
- {
- g_sigill_caught = 1;
- longjmp(g_jmpbuf, -1);
- }
+#if defined _WIN32
+ return IsDebuggerPresent();
+#elif defined __ANDROID__ || defined __linux__
+ // https://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb
+ int status_fd = open("/proc/self/status", O_RDONLY);
+ if (status_fd == -1)
+ return false;
- return EXCEPTION_CONTINUE_SEARCH;
-}
+ char buf[4096];
+ ssize_t num_read = read(status_fd, buf, sizeof(buf) - 1);
+ close(status_fd);
-static int detectisa(const void* some_inst)
-{
- g_sigill_caught = 0;
+ if (num_read <= 0)
+ return false;
- PVOID eh = AddVectoredExceptionHandler(1, catch_sigill);
+ buf[num_read] = '\0';
+ const char tracerPidString[] = "TracerPid:";
+ const char* tracer_pid_ptr = strstr(buf, tracerPidString);
+ if (!tracer_pid_ptr)
+ return false;
- if (setjmp(g_jmpbuf) == 0)
+ for (const char* ch = tracer_pid_ptr + sizeof(tracerPidString) - 1; ch <= buf + num_read; ++ch)
{
- ((void (*)())some_inst)();
- }
-
- RemoveVectoredExceptionHandler(eh);
-
- return g_sigill_caught ? 0 : 1;
-}
-
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned char name[] = {__VA_ARGS__, 0xc3};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned char name[] = {__VA_ARGS__, 0xc3};
-#endif
-#elif __aarch64__
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
-#endif
-#elif __arm__
-#ifdef _MSC_VER
-#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
-#else
-#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
-#endif
-#endif
-
-#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
-static int g_sigill_caught = 0;
-static sigjmp_buf g_jmpbuf;
-
-static void catch_sigill(int /*signo*/, siginfo_t* /*si*/, void* /*data*/)
-{
- g_sigill_caught = 1;
- siglongjmp(g_jmpbuf, -1);
-}
-
-static int detectisa(void (*some_inst)())
-{
- g_sigill_caught = 0;
-
- struct sigaction sa;
- struct sigaction old_sa;
- memset(&sa, 0, sizeof(sa));
- sa.sa_sigaction = catch_sigill;
- sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
- sigaction(SIGILL, &sa, &old_sa);
+ if (isspace(*ch))
+ continue;
- if (sigsetjmp(g_jmpbuf, 1) == 0)
- {
- some_inst();
+ return isdigit(*ch) != 0 && *ch != '0';
}
- sigaction(SIGILL, &old_sa, NULL);
+ return false;
+#elif defined __APPLE__
+ // https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x
+ struct kinfo_proc info;
+ info.kp_proc.p_flag = 0;
- return g_sigill_caught ? 0 : 1;
-}
+ int mib[4];
+ mib[0] = CTL_KERN;
+ mib[1] = KERN_PROC;
+ mib[2] = KERN_PROC_PID;
+ mib[3] = getpid();
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".byte " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#elif __aarch64__
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".word " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#elif __arm__
-#define DEFINE_INSTCODE(name, ...) \
- static void name() \
- { \
- asm volatile(".word " #__VA_ARGS__ \
- : \
- : \
- :); \
- };
-#endif
-
-#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
-
-#if defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
-DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
-DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
-DEFINE_INSTCODE(some_sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
-DEFINE_INSTCODE(some_sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
-DEFINE_INSTCODE(some_ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
-DEFINE_INSTCODE(some_sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
-DEFINE_INSTCODE(some_sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
-DEFINE_INSTCODE(some_sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
-DEFINE_INSTCODE(some_xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd %xmm0,%xmm0,%xmm0,%xmm0
-DEFINE_INSTCODE(some_avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
-DEFINE_INSTCODE(some_fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
-DEFINE_INSTCODE(some_avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
-DEFINE_INSTCODE(some_avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
-DEFINE_INSTCODE(some_avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph %zmm0,%zmm0,%zmm0
-DEFINE_INSTCODE(some_avxvnni, 0x62, 0xf2, 0x7d, 0x28, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
-DEFINE_INSTCODE(some_avxifma, 0x62, 0xf2, 0xfd, 0x28, 0xb4, 0xc0) // vpmadd52luq %ymm0,%ymm0,%ymm0
-
-#elif __aarch64__
-DEFINE_INSTCODE(some_neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
-DEFINE_INSTCODE(some_vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
-DEFINE_INSTCODE(some_cpuid, 0xd5380000) // mrs x0,midr_el1
-DEFINE_INSTCODE(some_asimdhp, 0x0e401400) // fadd v0.4h,v0.4h,v0.4h
-DEFINE_INSTCODE(some_asimddp, 0x4e809400) // sdot v0.4h,v0.16b,v0.16b
-DEFINE_INSTCODE(some_asimdfhm, 0x4e20ec00) // fmlal v0.4s,v0.4h,v0.4h
-DEFINE_INSTCODE(some_bf16, 0x6e40ec00) // bfmmla v0.4h,v0.8h,v0.8h
-DEFINE_INSTCODE(some_i8mm, 0x4e80a400) // smmla v0.4h,v0.16b,v0.16b
-DEFINE_INSTCODE(some_sve, 0x65608000) // fmad z0.h,p0/m,z0.h,z0.h
-DEFINE_INSTCODE(some_sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
-DEFINE_INSTCODE(some_svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
-DEFINE_INSTCODE(some_svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
-DEFINE_INSTCODE(some_svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
-
-#elif __arm__
-DEFINE_INSTCODE(some_edsp, 0x0000fb20) // smlad r0,r0,r0,r0
-DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0
-DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0
+ size_t size = sizeof(info);
+ sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, NULL, 0);
+ return ((info.kp_proc.p_flag & P_TRACED) != 0);
+#else
+ // unknown platform :(
+ fprintf(stderr, "unknown platform!\n");
+ return false;
#endif
-#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+}
#if defined __ANDROID__ || defined __linux__
@@ -694,7 +587,7 @@ static int get_cpu_support_x86_avx2()
static int get_cpu_support_x86_avx_vnni()
{
#if __APPLE__
- return detectisa(some_avxvnni);
+ return ruapu_supports("avxvnni");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -720,7 +613,7 @@ static int get_cpu_support_x86_avx_vnni()
static int get_cpu_support_x86_avx512()
{
#if __APPLE__
- return detectisa(some_avx512f) && detectisa(some_avx512bw) && detectisa(some_avx512cd) && detectisa(some_avx512dq) && detectisa(some_avx512vl);
+ return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -750,7 +643,7 @@ static int get_cpu_support_x86_avx512()
static int get_cpu_support_x86_avx512_vnni()
{
#if __APPLE__
- return detectisa(some_avx512vnni);
+ return ruapu_supports("avx512vnni");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -780,7 +673,7 @@ static int get_cpu_support_x86_avx512_vnni()
static int get_cpu_support_x86_avx512_bf16()
{
#if __APPLE__
- return detectisa(some_avx512bf16);
+ return ruapu_supports("avx512bf16");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -806,7 +699,7 @@ static int get_cpu_support_x86_avx512_bf16()
static int get_cpu_support_x86_avx512_fp16()
{
#if __APPLE__
- return detectisa(some_avx512fp16);
+ return ruapu_supports("avx512fp16");
#else
unsigned int cpu_info[4] = {0};
x86_cpuid(0, cpu_info);
@@ -1964,26 +1857,31 @@ static void initialize_global_cpu_info()
g_powersave = 0;
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
+#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__
+ if (!is_being_debugged())
+ {
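+        // ruapu probes isa support by executing candidate instructions and trapping
+        // illegal-instruction signals, which an attached debugger would intercept,
+        // so skip probing when being debugged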
+ ruapu_init();
+ }
+#endif
+
#if defined _WIN32
-#if __arm__
- g_cpu_support_arm_neon = detectisa(some_neon);
- g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#if __aarch64__
- g_cpu_support_arm_cpuid = detectisa(some_cpuid);
- g_cpu_support_arm_asimdhp = detectisa(some_asimdhp);
- g_cpu_support_arm_asimddp = detectisa(some_asimddp);
- g_cpu_support_arm_asimdfhm = detectisa(some_asimdfhm);
- g_cpu_support_arm_bf16 = detectisa(some_bf16);
- g_cpu_support_arm_i8mm = detectisa(some_i8mm);
- g_cpu_support_arm_sve = detectisa(some_sve);
- g_cpu_support_arm_sve2 = detectisa(some_sve2);
- g_cpu_support_arm_svebf16 = detectisa(some_svebf16);
- g_cpu_support_arm_svei8mm = detectisa(some_svei8mm);
- g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
-#else // __aarch64__
- g_cpu_support_arm_edsp = detectisa(some_edsp);
-#endif // __aarch64__
-#endif // __arm__
+ g_cpu_support_arm_cpuid = ruapu_supports("cpuid");
+ g_cpu_support_arm_asimdhp = ruapu_supports("asimdhp") || IsProcessorFeaturePresent(43); // dp implies hp
+ g_cpu_support_arm_asimddp = ruapu_supports("asimddp") || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+ g_cpu_support_arm_asimdfhm = ruapu_supports("asimdfhm");
+ g_cpu_support_arm_bf16 = ruapu_supports("bf16");
+ g_cpu_support_arm_i8mm = ruapu_supports("i8mm");
+ g_cpu_support_arm_sve = ruapu_supports("sve");
+ g_cpu_support_arm_sve2 = ruapu_supports("sve2");
+ g_cpu_support_arm_svebf16 = ruapu_supports("svebf16");
+ g_cpu_support_arm_svei8mm = ruapu_supports("svei8mm");
+ g_cpu_support_arm_svef32mm = ruapu_supports("svef32mm");
+#elif __arm__
+ g_cpu_support_arm_edsp = ruapu_supports("edsp");
+ g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
+ g_cpu_support_arm_vfpv4 = ruapu_supports("vfpv4");
+#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
g_hwcaps = get_elf_hwcap(AT_HWCAP);
g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
@@ -2196,21 +2094,15 @@ int cpu_support_arm_edsp()
int cpu_support_arm_neon()
{
try_initialize_global_cpu_info();
-#if __arm__
+#if __aarch64__
+ return 1;
+#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_neon;
#elif defined __ANDROID__ || defined __linux__
-#if __aarch64__
- return g_hwcaps & HWCAP_ASIMD;
-#else
return g_hwcaps & HWCAP_NEON;
-#endif
#elif __APPLE__
-#if __aarch64__
- return g_hw_cputype == CPU_TYPE_ARM64;
-#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
-#endif
#else
return 0;
#endif
@@ -2222,22 +2114,15 @@ int cpu_support_arm_neon()
int cpu_support_arm_vfpv4()
{
try_initialize_global_cpu_info();
-#if __arm__
+#if __aarch64__
+ return 1;
+#elif __arm__
#if defined _WIN32
return g_cpu_support_arm_vfpv4;
#elif defined __ANDROID__ || defined __linux__
-#if __aarch64__
- // neon always enable fma and fp16
- return g_hwcaps & HWCAP_ASIMD;
-#else
return g_hwcaps & HWCAP_VFPv4;
-#endif
#elif __APPLE__
-#if __aarch64__
- return g_hw_cputype == CPU_TYPE_ARM64;
-#else
return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
-#endif
#else
return 0;
#endif
@@ -2952,7 +2837,7 @@ int get_omp_thread_num()
int get_kmp_blocktime()
{
-#if defined(_OPENMP) && __clang__
+#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
return kmp_get_blocktime();
#else
return 0;
@@ -2961,7 +2846,7 @@ int get_kmp_blocktime()
void set_kmp_blocktime(int time_ms)
{
-#if defined(_OPENMP) && __clang__
+#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
kmp_set_blocktime(time_ms);
#else
(void)time_ms;
diff --git a/src/gpu.cpp b/src/gpu.cpp
index 224dcddc235..da26f72f53d 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -26,7 +26,7 @@
#include "glslang/glslang/Public/ShaderLang.h"
#endif
-#include "vulkan_activation.comp.hex.h"
+#include "layer/vulkan/shader/vulkan_activation.comp.hex.h"
#include "command.h"
#include "layer.h"
@@ -51,6 +51,7 @@ class __ncnn_vulkan_instance_holder
{
instance = 0;
created = 0;
+ glslang_initialized = false;
#if NCNN_VULKAN_LOADER
libvulkan = 0;
@@ -76,6 +77,7 @@ class __ncnn_vulkan_instance_holder
VkInstance instance;
int created;
+ bool glslang_initialized;
#if ENABLE_VALIDATION_LAYER
VkDebugUtilsMessengerEXT callback;
@@ -321,9 +323,11 @@ class GpuInfoPrivate
// fp16 and int8 feature
bool support_fp16_packed;
bool support_fp16_storage;
+ bool support_fp16_uniform;
bool support_fp16_arithmetic;
bool support_int8_packed;
bool support_int8_storage;
+ bool support_int8_uniform;
bool support_int8_arithmetic;
// ycbcr conversion feature
@@ -331,6 +335,7 @@ class GpuInfoPrivate
// cooperative matrix
bool support_cooperative_matrix;
+ bool support_cooperative_matrix_8_8_16;
bool support_cooperative_matrix_16_8_8;
bool support_cooperative_matrix_16_8_16;
bool support_cooperative_matrix_16_16_16;
@@ -605,6 +610,11 @@ bool GpuInfo::support_fp16_storage() const
return d->support_fp16_storage;
}
+bool GpuInfo::support_fp16_uniform() const
+{
+ return d->support_fp16_uniform;
+}
+
bool GpuInfo::support_fp16_arithmetic() const
{
return d->support_fp16_arithmetic;
@@ -620,6 +630,11 @@ bool GpuInfo::support_int8_storage() const
return d->support_int8_storage;
}
+bool GpuInfo::support_int8_uniform() const
+{
+ return d->support_int8_uniform;
+}
+
bool GpuInfo::support_int8_arithmetic() const
{
return d->support_int8_arithmetic;
@@ -635,6 +650,11 @@ bool GpuInfo::support_cooperative_matrix() const
return d->support_cooperative_matrix;
}
+bool GpuInfo::support_cooperative_matrix_8_8_16() const
+{
+ return d->support_cooperative_matrix_8_8_16;
+}
+
bool GpuInfo::support_cooperative_matrix_16_8_8() const
{
return d->support_cooperative_matrix_16_8_8;
@@ -1772,12 +1792,15 @@ int create_gpu_instance(const char* driver_path)
// check features
gpu_info.support_fp16_packed = true;
gpu_info.support_fp16_storage = false;
+ gpu_info.support_fp16_uniform = false;
gpu_info.support_fp16_arithmetic = false;
gpu_info.support_int8_packed = true;
gpu_info.support_int8_storage = false;
+ gpu_info.support_int8_uniform = false;
gpu_info.support_int8_arithmetic = false;
gpu_info.support_ycbcr_conversion = false;
gpu_info.support_cooperative_matrix = false;
+ gpu_info.support_cooperative_matrix_8_8_16 = false;
gpu_info.support_cooperative_matrix_16_8_8 = false;
gpu_info.support_cooperative_matrix_16_8_16 = false;
gpu_info.support_cooperative_matrix_16_16_16 = false;
@@ -1852,30 +1875,18 @@ int create_gpu_instance(const char* driver_path)
if (gpu_info.support_VK_KHR_8bit_storage)
{
gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess;
+ gpu_info.support_int8_uniform = query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
}
if (gpu_info.support_VK_KHR_16bit_storage && queryFeatures.features.shaderStorageImageExtendedFormats)
{
// shaderStorageImageExtendedFormats enables r16f format in storage image
gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess;
+ gpu_info.support_fp16_uniform = query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
}
if (gpu_info.support_VK_KHR_shader_float16_int8)
{
- if (gpu_info.support_fp16_storage)
- {
- gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16 && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
- }
- else
- {
- gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
- }
- if (gpu_info.support_int8_storage)
- {
- gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8 && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
- }
- else
- {
- gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
- }
+ gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16;
+ gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8;
}
if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion)
{
@@ -1945,6 +1956,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);
+ if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+ && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
+ && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
+ && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
+ {
+ gpu_info.support_cooperative_matrix_8_8_16 = true;
+ }
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
@@ -1994,6 +2012,13 @@ int create_gpu_instance(const char* driver_path)
const VkCooperativeMatrixPropertiesNV& cmp = properties[j];
// NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);
+ if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
+ && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
+ && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
+ && cmp.scope == VK_SCOPE_SUBGROUP_NV)
+ {
+ gpu_info.support_cooperative_matrix_8_8_16 = true;
+ }
if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
&& cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
&& cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
@@ -2027,17 +2052,17 @@ int create_gpu_instance(const char* driver_path)
NCNN_LOGE("[%u %s] bugsbn1=%d bugbilz=%d bugcopc=%d bugihfa=%d", i, physicalDeviceProperties.deviceName,
gpu_info.bug_storage_buffer_no_l1, gpu_info.bug_buffer_image_load_zero, gpu_info.bug_corrupted_online_pipeline_cache, gpu_info.bug_implicit_fp16_arithmetic);
- NCNN_LOGE("[%u %s] fp16-p/s/a=%d/%d/%d int8-p/s/a=%d/%d/%d", i, physicalDeviceProperties.deviceName,
- gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic,
- gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic);
+ NCNN_LOGE("[%u %s] fp16-p/s/u/a=%d/%d/%d/%d int8-p/s/u/a=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
+ gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_uniform, gpu_info.support_fp16_arithmetic,
+ gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_uniform, gpu_info.support_int8_arithmetic);
NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote,
gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle);
- NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName,
- gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16,
- gpu_info.support_cooperative_matrix_16_16_16);
+ NCNN_LOGE("[%u %s] fp16-8x8x16/16x8x8/16x8x16/16x16x16=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName,
+ gpu_info.support_cooperative_matrix_8_8_16, gpu_info.support_cooperative_matrix_16_8_8,
+ gpu_info.support_cooperative_matrix_16_8_16, gpu_info.support_cooperative_matrix_16_16_16);
gpu_info_index++;
}
@@ -2047,7 +2072,7 @@ int create_gpu_instance(const char* driver_path)
// the default gpu device
g_default_gpu_index = find_default_vulkan_device_index();
- glslang::InitializeProcess();
+ g_instance.glslang_initialized = glslang::InitializeProcess();
// the global __ncnn_vulkan_instance_holder destructor will call destroy_gpu_instance() on exit
// but it seems to be too late for nvidia driver :(
@@ -2077,7 +2102,11 @@ void destroy_gpu_instance()
// NCNN_LOGE("destroy_gpu_instance");
- glslang::FinalizeProcess();
+ if (g_instance.glslang_initialized)
+ {
+ glslang::FinalizeProcess();
+ g_instance.glslang_initialized = false;
+ }
for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
{
@@ -2089,14 +2118,18 @@ void destroy_gpu_instance()
}
#if ENABLE_VALIDATION_LAYER
- if (support_VK_EXT_debug_utils)
+ if (support_VK_EXT_debug_utils && g_instance.callback)
{
DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
g_instance.callback = 0;
}
#endif // ENABLE_VALIDATION_LAYER
- vkDestroyInstance(g_instance, 0);
+ if (vkDestroyInstance)
+ {
+ vkDestroyInstance(g_instance, 0);
+ vkDestroyInstance = 0;
+ }
g_instance.instance = 0;
@@ -2481,7 +2514,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
enabled8BitStorageFeatures.pNext = 0;
enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage();
- enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage() && info.support_int8_arithmetic();
+ enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_uniform();
enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage())
{
@@ -2494,7 +2527,7 @@ VulkanDevice::VulkanDevice(int device_index)
enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
enabled16BitStorageFeatures.pNext = 0;
enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage();
- enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage() && info.support_fp16_arithmetic();
+ enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_uniform();
enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE;
enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE;
if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage())
@@ -3868,11 +3901,16 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
}
- if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("lfp", "float16_t"));
custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4"));
}
+ else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
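+        // fp16 storage without uniform fp16 access: keep scalar lfp as float and
+        // pack four halfs into one uint64_t for shared memory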
+ custom_defines.push_back(std::make_pair("lfp", "float"));
+ custom_defines.push_back(std::make_pair("lfpvec4", "uint64_t"));
+ }
else if (opt.use_fp16_storage || opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("lfp", "float"));
@@ -3884,7 +3922,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("lfpvec4", "vec4"));
}
- if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));
@@ -3892,6 +3930,14 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v"));
}
+ else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
+ {
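+        // round-trip through uint64_t: reinterpret f16vec4 bits as 16-bit ints and
+        // pack on store, then unpack and reinterpret back on load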
+ custom_defines.push_back(std::make_pair("sfp2lfp(v)", "float(v)"));
+ custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "pack64(halfBitsToUInt16(v))"));
+
+ custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)"));
+ custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "int16BitsToHalf(unpack16(v))"));
+ }
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
@@ -4219,6 +4265,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_fp16_packed", "1"));
}
+ if (opt.use_fp16_uniform)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_fp16_uniform", "1"));
+ }
+
if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_fp16_arithmetic", "1"));
@@ -4233,6 +4284,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_int8_packed", "1"));
}
+ if (opt.use_int8_uniform)
+ {
+ custom_defines.push_back(std::make_pair("NCNN_int8_uniform", "1"));
+ }
+
if (opt.use_int8_arithmetic)
{
custom_defines.push_back(std::make_pair("NCNN_int8_arithmetic", "1"));
diff --git a/src/gpu.h b/src/gpu.h
index c3e5e8daeac..696f651ed2b 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -260,9 +260,11 @@ class NCNN_EXPORT GpuInfo
// fp16 and int8 feature
bool support_fp16_packed() const;
bool support_fp16_storage() const;
+ bool support_fp16_uniform() const;
bool support_fp16_arithmetic() const;
bool support_int8_packed() const;
bool support_int8_storage() const;
+ bool support_int8_uniform() const;
bool support_int8_arithmetic() const;
// ycbcr conversion feature
@@ -270,6 +272,7 @@ class NCNN_EXPORT GpuInfo
// cooperative matrix feature
bool support_cooperative_matrix() const;
+ bool support_cooperative_matrix_8_8_16() const;
bool support_cooperative_matrix_16_8_8() const;
bool support_cooperative_matrix_16_8_16() const;
bool support_cooperative_matrix_16_16_16() const;
diff --git a/src/layer.cpp b/src/layer.cpp
index 562576a5493..cca3e77bf1f 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -18,21 +18,7 @@
#include
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4250)
-#endif
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Woverloaded-virtual"
-#endif
#include "layer_declaration.h"
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
namespace ncnn {
@@ -221,9 +207,289 @@ Layer* create_layer(const char* type)
return create_layer(index);
}
+
+Layer* create_layer_naive(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_naive(index);
+}
+
+Layer* create_layer_cpu(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_cpu(index);
+}
+
+#if NCNN_VULKAN
+Layer* create_layer_vulkan(const char* type)
+{
+ int index = layer_to_index(type);
+ if (index == -1)
+ return 0;
+
+ return create_layer_vulkan(index);
+}
+#endif // NCNN_VULKAN
#endif // NCNN_STRING
+// internal wrapper
+class Layer_final : public Layer
+{
+public:
+ Layer* layer_cpu;
+#if NCNN_VULKAN
+ Layer* layer_vulkan;
+#endif
+
+ // utility functions for transfer layer properties
+ void set_layer_properties()
+ {
+ layer_cpu->userdata = userdata;
+
+ layer_cpu->bottoms = bottoms;
+ layer_cpu->tops = tops;
+ layer_cpu->bottom_shapes = bottom_shapes;
+ layer_cpu->top_shapes = top_shapes;
+ layer_cpu->featmask = featmask;
+
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ layer_vulkan->vkdev = vkdev;
+
+ layer_vulkan->userdata = userdata;
+
+ layer_vulkan->bottoms = bottoms;
+ layer_vulkan->tops = tops;
+ layer_vulkan->bottom_shapes = bottom_shapes;
+ layer_vulkan->top_shapes = top_shapes;
+ layer_vulkan->featmask = featmask;
+ }
+#endif
+ }
+
+ void get_layer_properties()
+ {
+ one_blob_only = layer_cpu->one_blob_only;
+ support_inplace = layer_cpu->support_inplace;
+ support_packing = layer_cpu->support_packing;
+ support_bf16_storage = layer_cpu->support_bf16_storage;
+ support_fp16_storage = layer_cpu->support_fp16_storage;
+ support_int8_storage = layer_cpu->support_int8_storage;
+
+ support_vulkan = 0;
+ support_image_storage = 0;
+ support_tensor_storage = 0;
+
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ support_vulkan = layer_vulkan->support_vulkan;
+ support_image_storage = layer_vulkan->support_image_storage;
+ support_tensor_storage = layer_vulkan->support_tensor_storage;
+ }
+#endif
+ }
+
+public:
+ Layer_final()
+ {
+ layer_cpu = 0;
+#if NCNN_VULKAN
+ layer_vulkan = 0;
+#endif
+ }
+
+ ~Layer_final()
+ {
+ delete layer_cpu;
+#if NCNN_VULKAN
+ delete layer_vulkan;
+#endif
+ }
+
+ virtual int load_param(const ParamDict& pd)
+ {
+ set_layer_properties();
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ if (vkdev)
+ {
+ int ret = layer_vulkan->load_param(pd);
+ get_layer_properties();
+
+ if (layer_vulkan->support_vulkan)
+ return ret;
+ }
+
+ // fallback to cpu layer
+ delete layer_vulkan;
+ layer_vulkan = 0;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->load_param(pd);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int load_model(const ModelBin& mb)
+ {
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ int ret = layer_vulkan->load_model(mb);
+ get_layer_properties();
+ return ret;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->load_model(mb);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int create_pipeline(const Option& opt)
+ {
+ set_layer_properties();
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ int ret = layer_vulkan->create_pipeline(opt);
+ get_layer_properties();
+ return ret;
+ }
+#endif // NCNN_VULKAN
+
+ int ret = layer_cpu->create_pipeline(opt);
+ get_layer_properties();
+ return ret;
+ }
+
+ virtual int destroy_pipeline(const Option& opt)
+ {
+#if NCNN_VULKAN
+ if (layer_vulkan)
+ {
+ return layer_vulkan->destroy_pipeline(opt);
+ }
+#endif // NCNN_VULKAN
+
+ return layer_cpu->destroy_pipeline(opt);
+ }
+
+public:
+ virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+ {
+ return layer_cpu->forward(bottom_blobs, top_blobs, opt);
+ }
+
+ virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+ {
+ return layer_cpu->forward(bottom_blob, top_blob, opt);
+ }
+
+ virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
+ {
+ return layer_cpu->forward_inplace(bottom_top_blobs, opt);
+ }
+
+ virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+ {
+ return layer_cpu->forward_inplace(bottom_top_blob, opt);
+ }
+
+#if NCNN_VULKAN
+public:
+ virtual int upload_model(VkTransfer& cmd, const Option& opt)
+ {
+ return layer_vulkan ? layer_vulkan->upload_model(cmd, opt) : -1;
+ }
+
+ virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt) : -1;
+ }
+
+ virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+ {
+ return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt) : -1;
+ }
+#endif // NCNN_VULKAN
+};
+
Layer* create_layer(int index)
+{
+ Layer* layer_cpu = create_layer_cpu(index);
+ if (!layer_cpu)
+ return 0;
+
+ Layer_final* layer_final = new Layer_final;
+ layer_final->layer_cpu = layer_cpu;
+
+#if NCNN_VULKAN
+ layer_final->layer_vulkan = create_layer_vulkan(index);
+#endif
+
+ layer_final->typeindex = index;
+ layer_final->set_layer_properties();
+ layer_final->get_layer_properties();
+
+ return layer_final;
+}
+
+Layer* create_layer_naive(int index)
+{
+ if (index < 0 || index >= layer_registry_entry_count)
+ return 0;
+
+ layer_creator_func layer_creator = layer_registry[index].creator;
+ if (!layer_creator)
+ return 0;
+
+ Layer* layer = layer_creator(0);
+ layer->typeindex = index;
+ return layer;
+}
+
+Layer* create_layer_cpu(int index)
{
if (index < 0 || index >= layer_registry_entry_count)
return 0;
@@ -280,6 +546,11 @@ Layer* create_layer(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+ {
+ layer_creator = layer_registry_arch[index].creator;
+ }
+
+ if (!layer_creator)
{
layer_creator = layer_registry[index].creator;
}
@@ -293,4 +564,20 @@ Layer* create_layer(int index)
return layer;
}
+#if NCNN_VULKAN
+Layer* create_layer_vulkan(int index)
+{
+ if (index < 0 || index >= layer_registry_entry_count)
+ return 0;
+
+ layer_creator_func layer_creator = layer_registry_vulkan[index].creator;
+ if (!layer_creator)
+ return 0;
+
+ Layer* layer = layer_creator(0);
+ layer->typeindex = index;
+ return layer;
+}
+#endif // NCNN_VULKAN
+
} // namespace ncnn
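
The new entry points split what used to be one factory: create_layer_naive returns the reference implementation from layer_registry, create_layer_cpu picks the arch-optimized creator (layer_registry_arch, falling back to the generic registry), create_layer_vulkan returns the GPU implementation or null, and create_layer itself now returns the Layer_final wrapper that owns one of each. A minimal caller-side sketch (illustrative helper, not part of the patch; assumes an ncnn build with NCNN_STRING enabled):

// Illustrative sketch: picking a backend with the split factories.
#include "layer.h"

static ncnn::Layer* make_relu(bool prefer_vulkan)
{
#if NCNN_VULKAN
    if (prefer_vulkan)
    {
        // vulkan-only object; returns null when the layer has no vulkan variant
        if (ncnn::Layer* l = ncnn::create_layer_vulkan("ReLU"))
            return l;
    }
#endif
    // arch-optimized cpu object, chosen from layer_registry_arch at runtime
    return ncnn::create_layer_cpu("ReLU");
}

Plain create_layer("ReLU") keeps working and now hands back the Layer_final wrapper above, which owns one cpu object plus an optional vulkan object and forwards each virtual call to whichever backend survived load_param.
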
diff --git a/src/layer.h b/src/layer.h
index 573f58cf94a..d44713de451 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -199,9 +199,19 @@ struct overwrite_builtin_layer_registry_entry
NCNN_EXPORT int layer_to_index(const char* type);
// create layer from type name
NCNN_EXPORT Layer* create_layer(const char* type);
+NCNN_EXPORT Layer* create_layer_naive(const char* type);
+NCNN_EXPORT Layer* create_layer_cpu(const char* type);
+#if NCNN_VULKAN
+NCNN_EXPORT Layer* create_layer_vulkan(const char* type);
+#endif // NCNN_VULKAN
#endif // NCNN_STRING
// create layer from layer type
NCNN_EXPORT Layer* create_layer(int index);
+NCNN_EXPORT Layer* create_layer_naive(int index);
+NCNN_EXPORT Layer* create_layer_cpu(int index);
+#if NCNN_VULKAN
+NCNN_EXPORT Layer* create_layer_vulkan(int index);
+#endif // NCNN_VULKAN
#define DEFINE_LAYER_CREATOR(name) \
::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h
index 7e6f150e7f1..c1cea1dfb9e 100644
--- a/src/layer/arm/absval_arm.h
+++ b/src/layer/arm/absval_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_arm : virtual public AbsVal
+class AbsVal_arm : public AbsVal
{
public:
AbsVal_arm();
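
The `virtual public` → `public` change here repeats across every backend header in this patch (arm here, loongarch and others further down). Previously the generated *_final classes in layer_declaration.h inherited from the base layer and its backend specializations at once, which required virtual bases to keep a single shared Layer subobject — and the dominance/overloaded-virtual warning suppressions deleted at the top of layer.cpp. With Layer_final owning its backends by pointer, each backend is an ordinary single-inheritance chain. A standalone sketch of the two shapes (illustrative names, not ncnn code):

#include <cstdio>

struct Layer
{
    int featmask = 0;
    virtual ~Layer() {}
};

// before: the generated final class inherited several backends at once,
// so every backend derived virtually to share one Layer subobject:
//   struct AbsVal : virtual Layer {};
//   struct AbsVal_arm : virtual public AbsVal {};
//   struct AbsVal_final : AbsVal, AbsVal_arm {}; // diamond
// after: one plain chain per backend; the wrapper owns it by pointer
struct AbsVal : Layer {};
struct AbsVal_arm : AbsVal {};

struct Final_sketch : Layer
{
    Layer* layer_cpu = new AbsVal_arm; // composition, no diamond
    ~Final_sketch() { delete layer_cpu; }
};

int main()
{
    Final_sketch f;
    std::printf("featmask = %d\n", f.layer_cpu->featmask);
    return 0;
}
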
diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h
index 9be82439cb4..1393bb30e12 100644
--- a/src/layer/arm/batchnorm_arm.h
+++ b/src/layer/arm/batchnorm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_arm : virtual public BatchNorm
+class BatchNorm_arm : public BatchNorm
{
public:
BatchNorm_arm();
diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h
index a3b61cd300d..5f08facf17e 100644
--- a/src/layer/arm/bias_arm.h
+++ b/src/layer/arm/bias_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_arm : virtual public Bias
+class Bias_arm : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/binaryop_arm.h b/src/layer/arm/binaryop_arm.h
index 6bb950495ce..1337065eb40 100644
--- a/src/layer/arm/binaryop_arm.h
+++ b/src/layer/arm/binaryop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_arm : virtual public BinaryOp
+class BinaryOp_arm : public BinaryOp
{
public:
BinaryOp_arm();
diff --git a/src/layer/arm/cast_arm.h b/src/layer/arm/cast_arm.h
index 190090a859a..fc32c70d3dd 100644
--- a/src/layer/arm/cast_arm.h
+++ b/src/layer/arm/cast_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_arm : virtual public Cast
+class Cast_arm : public Cast
{
public:
Cast_arm();
diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h
index 5ae5513145d..ab196687154 100644
--- a/src/layer/arm/cast_bf16.h
+++ b/src/layer/arm/cast_bf16.h
@@ -88,7 +88,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const
"vshrn.u32 d1, q1, #16 \n"
"vshrn.u32 d2, q2, #16 \n"
"vshrn.u32 d3, q3, #16 \n"
- "vst1.u16 {d0-d3}, [%1 :128]! \n"
+ "vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
@@ -231,7 +231,7 @@ static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
#else // __aarch64__
asm volatile(
"pld [%0, #256] \n"
- "vld1.u16 {d4-d7}, [%0 :128]! \n"
+ "vld1.u16 {d4-d7}, [%0]! \n"
"vshll.u16 q0, d4, #16 \n"
"vshll.u16 q1, d5, #16 \n"
"vshll.u16 q2, d6, #16 \n"
diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h
index b27a6ebd34f..7d5866d0a19 100644
--- a/src/layer/arm/cast_fp16.h
+++ b/src/layer/arm/cast_fp16.h
@@ -62,13 +62,13 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
- "pld [%0, #512] \n"
- "vldm %0!, {d0-d7} \n"
- "vcvt.f16.f32 d0, q0 \n"
- "vcvt.f16.f32 d1, q1 \n"
- "vcvt.f16.f32 d2, q2 \n"
- "vcvt.f16.f32 d3, q3 \n"
- "vst1.u16 {d0-d3}, [%1 :128]! \n"
+ "pld [%0, #512] \n"
+ "vldm %0!, {d0-d7} \n"
+ "vcvt.f16.f32 d0, q0 \n"
+ "vcvt.f16.f32 d1, q1 \n"
+ "vcvt.f16.f32 d2, q2 \n"
+ "vcvt.f16.f32 d3, q3 \n"
+ "vst1.u16 {d0-d3}, [%1]! \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
@@ -220,13 +220,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const
: "memory", "v0", "v1", "v2", "v3");
#else // __aarch64__
asm volatile(
- "pld [%0, #256] \n"
- "vld1.u16 {d4-d7}, [%0 :128]! \n"
- "vcvt.f32.f16 q0, d4 \n"
- "vcvt.f32.f16 q1, d5 \n"
- "vcvt.f32.f16 q2, d6 \n"
- "vcvt.f32.f16 q3, d7 \n"
- "vstm %1!, {d0-d7} \n"
+ "pld [%0, #256] \n"
+ "vld1.u16 {d4-d7}, [%0]! \n"
+ "vcvt.f32.f16 q0, d4 \n"
+ "vcvt.f32.f16 q1, d5 \n"
+ "vcvt.f32.f16 q2, d6 \n"
+ "vcvt.f32.f16 q3, d7 \n"
+ "vstm %1!, {d0-d7} \n"
: "=r"(ptr), // %0
"=r"(outptr) // %1
: "0"(ptr),
diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h
index 8af695172e1..ef281d249e7 100644
--- a/src/layer/arm/clip_arm.h
+++ b/src/layer/arm/clip_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_arm : virtual public Clip
+class Clip_arm : public Clip
{
public:
Clip_arm();
diff --git a/src/layer/arm/concat_arm.h b/src/layer/arm/concat_arm.h
index c09dfa27568..9491a280110 100644
--- a/src/layer/arm/concat_arm.h
+++ b/src/layer/arm/concat_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_arm : virtual public Concat
+class Concat_arm : public Concat
{
public:
Concat_arm();
diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp
index 48368fb9cc6..26389279b18 100644
--- a/src/layer/arm/convolution1d_arm.cpp
+++ b/src/layer/arm/convolution1d_arm.cpp
@@ -68,6 +68,8 @@ int Convolution1D_arm::create_pipeline(const Option& opt)
convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
@@ -196,7 +198,7 @@ int Convolution1D_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -237,6 +239,8 @@ int Convolution1D_arm::create_pipeline_bf16s(const Option& /*opt*/)
convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
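
These `weight_data.release()` insertions, and the many hunks below that delete the `if (opt.lightmode)` guard around an existing release, implement one policy: once create_pipeline has produced its transformed copy (weight_data_tm, the fp16/bf16 repacks, or a sub-layer's own pipeline), the raw fp32 weights are never read again, so they are freed unconditionally rather than only in lightmode. The resulting pattern, sketched with hypothetical names:

#include "mat.h" // ncnn::Mat

// pattern sketch with hypothetical names, mirroring the patched hunks
static int create_pipeline_pattern(ncnn::Mat& weight_data, ncnn::Mat& weight_data_tm)
{
    // repack raw weights into the layout the inference kernels consume
    weight_data_tm = weight_data.clone(); // stand-in for the real transform

    // previously: if (opt.lightmode) weight_data.release();
    // now: always released -- forward() reads only weight_data_tm
    weight_data.release();
    return 0;
}
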
diff --git a/src/layer/arm/convolution1d_arm.h b/src/layer/arm/convolution1d_arm.h
index 83e0ea83809..48babb914d2 100644
--- a/src/layer/arm/convolution1d_arm.h
+++ b/src/layer/arm/convolution1d_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_arm : virtual public Convolution1D
+class Convolution1D_arm : public Convolution1D
{
public:
Convolution1D_arm();
diff --git a/src/layer/arm/convolution1d_arm_asimdhp.cpp b/src/layer/arm/convolution1d_arm_asimdhp.cpp
index bbbd5883027..2e194eabf21 100644
--- a/src/layer/arm/convolution1d_arm_asimdhp.cpp
+++ b/src/layer/arm/convolution1d_arm_asimdhp.cpp
@@ -36,6 +36,8 @@ int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+ weight_data.release();
+
return 0;
}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index cde8c216873..f7f04619e9e 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -157,7 +157,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
{
- convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
+ convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -194,6 +194,8 @@ int Convolution_arm::create_pipeline(const Option& opt)
convolution_dilation1->create_pipeline(opt);
+ weight_data.release();
+
return 0;
}
@@ -222,10 +224,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
else
conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -271,10 +270,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
{
convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -309,10 +305,7 @@ int Convolution_arm::create_pipeline(const Option& opt)
convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -807,7 +800,7 @@ int Convolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -1031,10 +1022,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1042,10 +1030,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h
index 412590f101e..8536c081320 100644
--- a/src/layer/arm/convolutiondepthwise_arm.h
+++ b/src/layer/arm/convolutiondepthwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_arm : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_arm : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_arm();
diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
index f7d2cfee84c..1d5f2782cc1 100644
--- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
@@ -76,10 +76,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -87,10 +84,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/crop_arm.h b/src/layer/arm/crop_arm.h
index e3f6d5109a3..9f2bea6e1bd 100644
--- a/src/layer/arm/crop_arm.h
+++ b/src/layer/arm/crop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_arm : virtual public Crop
+class Crop_arm : public Crop
{
public:
Crop_arm();
diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp
index ef8a56f9931..24c825ae266 100644
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -85,7 +85,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -211,10 +211,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -851,7 +848,7 @@ int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -957,10 +954,7 @@ int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h
index 3c7979687cb..b4cdcbe0ee9 100644
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_arm : virtual public Deconvolution
+class Deconvolution_arm : public Deconvolution
{
public:
Deconvolution_arm();
diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp
index c98ba40309b..b5498d815f3 100644
--- a/src/layer/arm/deconvolution_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp
@@ -45,7 +45,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -154,10 +154,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp
index 478bd1740dc..4eac426d9de 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -104,10 +104,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -148,7 +145,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -193,10 +190,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -562,7 +556,7 @@ int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, st
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h
index 6eff45ede3a..a7ef393dd25 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_arm();
diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
index 09e0fca4356..5fa42d07490 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
@@ -100,7 +100,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -145,10 +145,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/dequantize_arm.h b/src/layer/arm/dequantize_arm.h
index 5bba8de7fdd..677c731db69 100644
--- a/src/layer/arm/dequantize_arm.h
+++ b/src/layer/arm/dequantize_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_arm : virtual public Dequantize
+class Dequantize_arm : public Dequantize
{
public:
Dequantize_arm();
diff --git a/src/layer/arm/dropout_arm.h b/src/layer/arm/dropout_arm.h
index 395c5a9d02c..9a970525aae 100644
--- a/src/layer/arm/dropout_arm.h
+++ b/src/layer/arm/dropout_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_arm : virtual public Dropout
+class Dropout_arm : public Dropout
{
public:
Dropout_arm();
diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h
index 5480f2293ce..6bd91f5dab5 100644
--- a/src/layer/arm/eltwise_arm.h
+++ b/src/layer/arm/eltwise_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_arm : virtual public Eltwise
+class Eltwise_arm : public Eltwise
{
public:
Eltwise_arm();
diff --git a/src/layer/arm/flatten_arm.h b/src/layer/arm/flatten_arm.h
index 92932ba7744..9bc9a0d1b99 100644
--- a/src/layer/arm/flatten_arm.h
+++ b/src/layer/arm/flatten_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_arm : virtual public Flatten
+class Flatten_arm : public Flatten
{
public:
Flatten_arm();
diff --git a/src/layer/arm/gelu_arm.h b/src/layer/arm/gelu_arm.h
index 283f063bb69..5be9fc4d6d5 100644
--- a/src/layer/arm/gelu_arm.h
+++ b/src/layer/arm/gelu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_arm : virtual public GELU
+class GELU_arm : public GELU
{
public:
GELU_arm();
diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp
index 2d4ff8734f8..3463550d3d4 100644
--- a/src/layer/arm/gemm_arm.cpp
+++ b/src/layer/arm/gemm_arm.cpp
@@ -4201,10 +4201,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4244,10 +4241,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4277,10 +4271,7 @@ int Gemm_arm::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
@@ -4898,10 +4889,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4941,10 +4929,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4974,10 +4959,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gemm_arm.h b/src/layer/arm/gemm_arm.h
index e4e4b81f2ee..0c1eab108ba 100644
--- a/src/layer/arm/gemm_arm.h
+++ b/src/layer/arm/gemm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_arm : virtual public Gemm
+class Gemm_arm : public Gemm
{
public:
Gemm_arm();
diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp
index ff840df3b50..cfe6ce8ce60 100644
--- a/src/layer/arm/gemm_arm_asimdhp.cpp
+++ b/src/layer/arm/gemm_arm_asimdhp.cpp
@@ -2736,10 +2736,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -2779,10 +2776,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -2808,10 +2802,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
}
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gemm_arm_vfpv4.cpp b/src/layer/arm/gemm_arm_vfpv4.cpp
index 3d29af41860..5792e47e980 100644
--- a/src/layer/arm/gemm_arm_vfpv4.cpp
+++ b/src/layer/arm/gemm_arm_vfpv4.cpp
@@ -427,10 +427,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -470,10 +467,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -503,10 +497,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp
index 70df351a555..58df8275ad5 100644
--- a/src/layer/arm/gru_arm.cpp
+++ b/src/layer/arm/gru_arm.cpp
@@ -250,6 +250,10 @@ int GRU_arm::create_pipeline(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
@@ -1368,6 +1372,10 @@ int GRU_arm::create_pipeline_bf16s(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/gru_arm.h b/src/layer/arm/gru_arm.h
index e1e8fbb08fd..6eae1656b01 100644
--- a/src/layer/arm/gru_arm.h
+++ b/src/layer/arm/gru_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GRU_arm : virtual public GRU
+class GRU_arm : public GRU
{
public:
GRU_arm();
diff --git a/src/layer/arm/gru_arm_asimdhp.cpp b/src/layer/arm/gru_arm_asimdhp.cpp
index c38458176af..fcdce2d8e18 100644
--- a/src/layer/arm/gru_arm_asimdhp.cpp
+++ b/src/layer/arm/gru_arm_asimdhp.cpp
@@ -914,6 +914,10 @@ int GRU_arm::create_pipeline_fp16s(const Option& opt)
}
}
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/hardsigmoid_arm.h b/src/layer/arm/hardsigmoid_arm.h
index bfa04828ac6..13783ff1690 100644
--- a/src/layer/arm/hardsigmoid_arm.h
+++ b/src/layer/arm/hardsigmoid_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_arm : virtual public HardSigmoid
+class HardSigmoid_arm : public HardSigmoid
{
public:
HardSigmoid_arm();
diff --git a/src/layer/arm/hardswish_arm.h b/src/layer/arm/hardswish_arm.h
index 7309ba6c71f..a534ceb1677 100644
--- a/src/layer/arm/hardswish_arm.h
+++ b/src/layer/arm/hardswish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_arm : virtual public HardSwish
+class HardSwish_arm : public HardSwish
{
public:
HardSwish_arm();
diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp
index 98eda2d171b..0cbc78525eb 100644
--- a/src/layer/arm/innerproduct_arm.cpp
+++ b/src/layer/arm/innerproduct_arm.cpp
@@ -46,7 +46,7 @@ InnerProduct_arm::InnerProduct_arm()
int InnerProduct_arm::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -122,10 +122,7 @@ int InnerProduct_arm::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -868,10 +865,7 @@ int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1264,10 +1258,7 @@ int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index f1eee178f9c..70a54533151 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_arm : virtual public InnerProduct
+class InnerProduct_arm : public InnerProduct
{
public:
InnerProduct_arm();
diff --git a/src/layer/arm/innerproduct_arm_vfpv4.cpp b/src/layer/arm/innerproduct_arm_vfpv4.cpp
index 435fb883e50..6a6eab84fba 100644
--- a/src/layer/arm/innerproduct_arm_vfpv4.cpp
+++ b/src/layer/arm/innerproduct_arm_vfpv4.cpp
@@ -41,10 +41,7 @@ int InnerProduct_arm::create_pipeline_fp16s(const Option& opt)
}
#endif
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/arm/instancenorm_arm.h b/src/layer/arm/instancenorm_arm.h
index 102c49fe2b0..98dec71ac48 100644
--- a/src/layer/arm/instancenorm_arm.h
+++ b/src/layer/arm/instancenorm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InstanceNorm_arm : virtual public InstanceNorm
+class InstanceNorm_arm : public InstanceNorm
{
public:
InstanceNorm_arm();
diff --git a/src/layer/arm/interp_arm.h b/src/layer/arm/interp_arm.h
index 5ea9873ae78..6c15c73801b 100644
--- a/src/layer/arm/interp_arm.h
+++ b/src/layer/arm/interp_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_arm : virtual public Interp
+class Interp_arm : public Interp
{
public:
Interp_arm();
diff --git a/src/layer/arm/lrn_arm.h b/src/layer/arm/lrn_arm.h
index db9a04e0adb..f2c43ba08f2 100644
--- a/src/layer/arm/lrn_arm.h
+++ b/src/layer/arm/lrn_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_arm : virtual public LRN
+class LRN_arm : public LRN
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp
index 04d7277547e..b8d5afe93dc 100644
--- a/src/layer/arm/lstm_arm.cpp
+++ b/src/layer/arm/lstm_arm.cpp
@@ -124,12 +124,9 @@ int LSTM_arm::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
@@ -931,12 +928,9 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/arm/lstm_arm.h b/src/layer/arm/lstm_arm.h
index a42dff28823..b5ee1092a52 100644
--- a/src/layer/arm/lstm_arm.h
+++ b/src/layer/arm/lstm_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LSTM_arm : virtual public LSTM
+class LSTM_arm : public LSTM
{
public:
LSTM_arm();
diff --git a/src/layer/arm/lstm_arm_asimdhp.cpp b/src/layer/arm/lstm_arm_asimdhp.cpp
index 1d3fc71cdfc..593af33ccd4 100644
--- a/src/layer/arm/lstm_arm_asimdhp.cpp
+++ b/src/layer/arm/lstm_arm_asimdhp.cpp
@@ -835,12 +835,9 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/arm/matmul_arm.cpp b/src/layer/arm/matmul_arm.cpp
index 7117ce49511..363ab4490bb 100644
--- a/src/layer/arm/matmul_arm.cpp
+++ b/src/layer/arm/matmul_arm.cpp
@@ -37,7 +37,7 @@ MatMul_arm::MatMul_arm()
int MatMul_arm::create_pipeline(const Option& opt)
{
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
diff --git a/src/layer/arm/matmul_arm.h b/src/layer/arm/matmul_arm.h
index 4d4784ce50d..a4537300d5a 100644
--- a/src/layer/arm/matmul_arm.h
+++ b/src/layer/arm/matmul_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MatMul_arm : virtual public MatMul
+class MatMul_arm : public MatMul
{
public:
MatMul_arm();
diff --git a/src/layer/arm/mish_arm.h b/src/layer/arm/mish_arm.h
index 708611589f4..9f99a7a1200 100644
--- a/src/layer/arm/mish_arm.h
+++ b/src/layer/arm/mish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_arm : virtual public Mish
+class Mish_arm : public Mish
{
public:
Mish_arm();
diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp
index 15eca715699..b3f3d7aa8e7 100644
--- a/src/layer/arm/multiheadattention_arm.cpp
+++ b/src/layer/arm/multiheadattention_arm.cpp
@@ -48,7 +48,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
opt.use_bf16_storage &= support_bf16_storage;
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, -1);
pd.set(1, 1);
@@ -61,7 +61,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
const int embed_dim_per_head = embed_dim / num_heads;
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
pd.set(1, 1.f);
@@ -84,15 +84,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- q_weight_data.release();
- q_bias_data.release();
- }
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -113,15 +110,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- k_weight_data.release();
- k_bias_data.release();
- }
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -142,15 +136,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- v_weight_data.release();
- v_bias_data.release();
- }
+ v_weight_data.release();
+ v_bias_data.release();
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 1); // transB
@@ -169,15 +160,12 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- out_weight_data.release();
- out_bias_data.release();
- }
+ out_weight_data.release();
+ out_bias_data.release();
}
{
- qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 0); // transB
@@ -198,7 +186,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
}
{
- qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h
index fb1010b1b01..f1b721f22ea 100644
--- a/src/layer/arm/multiheadattention_arm.h
+++ b/src/layer/arm/multiheadattention_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_arm : virtual public MultiHeadAttention
+class MultiHeadAttention_arm : public MultiHeadAttention
{
public:
MultiHeadAttention_arm();
diff --git a/src/layer/arm/packing_arm.h b/src/layer/arm/packing_arm.h
index 20cb04ac5f3..17c64854058 100644
--- a/src/layer/arm/packing_arm.h
+++ b/src/layer/arm/packing_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_arm : virtual public Packing
+class Packing_arm : public Packing
{
public:
Packing_arm();
diff --git a/src/layer/arm/padding_arm.h b/src/layer/arm/padding_arm.h
index 81156fcd831..164cfe4c33a 100644
--- a/src/layer/arm/padding_arm.h
+++ b/src/layer/arm/padding_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_arm : virtual public Padding
+class Padding_arm : public Padding
{
public:
Padding_arm();
diff --git a/src/layer/arm/pixelshuffle_arm.h b/src/layer/arm/pixelshuffle_arm.h
index c40d67ddec8..a2d714c9ebb 100644
--- a/src/layer/arm/pixelshuffle_arm.h
+++ b/src/layer/arm/pixelshuffle_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PixelShuffle_arm : virtual public PixelShuffle
+class PixelShuffle_arm : public PixelShuffle
{
public:
PixelShuffle_arm();
diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h
index 0193faa6a87..ead9270c717 100644
--- a/src/layer/arm/pooling_arm.h
+++ b/src/layer/arm/pooling_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_arm : virtual public Pooling
+class Pooling_arm : public Pooling
{
public:
Pooling_arm();
diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h
index e65801a3be0..9354be7440b 100644
--- a/src/layer/arm/prelu_arm.h
+++ b/src/layer/arm/prelu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_arm : virtual public PReLU
+class PReLU_arm : public PReLU
{
public:
PReLU_arm();
diff --git a/src/layer/arm/quantize_arm.h b/src/layer/arm/quantize_arm.h
index 3ed271ca7fe..60a716198cb 100644
--- a/src/layer/arm/quantize_arm.h
+++ b/src/layer/arm/quantize_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_arm : virtual public Quantize
+class Quantize_arm : public Quantize
{
public:
Quantize_arm();
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
index 77bda6ac5b5..c2212513a42 100644
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_arm : virtual public ReLU
+class ReLU_arm : public ReLU
{
public:
ReLU_arm();
diff --git a/src/layer/arm/requantize_arm.h b/src/layer/arm/requantize_arm.h
index e7093a7e4c1..c6fc184a018 100644
--- a/src/layer/arm/requantize_arm.h
+++ b/src/layer/arm/requantize_arm.h
@@ -20,7 +20,7 @@
namespace ncnn {
-class Requantize_arm : virtual public Requantize
+class Requantize_arm : public Requantize
{
public:
Requantize_arm();
diff --git a/src/layer/arm/reshape_arm.h b/src/layer/arm/reshape_arm.h
index 7a2474b7cb5..85466ecfd68 100644
--- a/src/layer/arm/reshape_arm.h
+++ b/src/layer/arm/reshape_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_arm : virtual public Reshape
+class Reshape_arm : public Reshape
{
public:
Reshape_arm();
diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp
index 19f439ea2d5..15b9f0b8a0d 100644
--- a/src/layer/arm/rnn_arm.cpp
+++ b/src/layer/arm/rnn_arm.cpp
@@ -139,6 +139,10 @@ int RNN_arm::create_pipeline(const Option& opt)
bias_c_data_packed = bias_c_data;
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
@@ -732,6 +736,10 @@ int RNN_arm::create_pipeline_bf16s(const Option& opt)
cast_float32_to_bfloat16(bias_c_data, bias_c_data_packed, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/rnn_arm.h b/src/layer/arm/rnn_arm.h
index 5defad4cf08..18e75642b9e 100644
--- a/src/layer/arm/rnn_arm.h
+++ b/src/layer/arm/rnn_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class RNN_arm : virtual public RNN
+class RNN_arm : public RNN
{
public:
RNN_arm();
diff --git a/src/layer/arm/rnn_arm_asimdhp.cpp b/src/layer/arm/rnn_arm_asimdhp.cpp
index c34b3e8bb48..467dba614f8 100644
--- a/src/layer/arm/rnn_arm_asimdhp.cpp
+++ b/src/layer/arm/rnn_arm_asimdhp.cpp
@@ -517,6 +517,10 @@ int RNN_arm::create_pipeline_fp16s(const Option& opt)
cast_float32_to_float16(bias_c_data, bias_c_data_packed, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h
index c327376d17e..c540cdd62ed 100644
--- a/src/layer/arm/scale_arm.h
+++ b/src/layer/arm/scale_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_arm : virtual public Scale
+class Scale_arm : public Scale
{
public:
Scale_arm();
diff --git a/src/layer/arm/selu_arm.h b/src/layer/arm/selu_arm.h
index ad0bdf2f955..d951804db68 100644
--- a/src/layer/arm/selu_arm.h
+++ b/src/layer/arm/selu_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_arm : virtual public SELU
+class SELU_arm : public SELU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/arm/shufflechannel_arm.h b/src/layer/arm/shufflechannel_arm.h
index f7a32ac4ab7..dcdbf760bb3 100644
--- a/src/layer/arm/shufflechannel_arm.h
+++ b/src/layer/arm/shufflechannel_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_arm : virtual public ShuffleChannel
+class ShuffleChannel_arm : public ShuffleChannel
{
public:
ShuffleChannel_arm();
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
index f532a44d6f5..4c3901abbe9 100644
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_arm : virtual public Sigmoid
+class Sigmoid_arm : public Sigmoid
{
public:
Sigmoid_arm();
diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h
index 50da56743b2..c3b558b9e1d 100644
--- a/src/layer/arm/slice_arm.h
+++ b/src/layer/arm/slice_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_arm : virtual public Slice
+class Slice_arm : public Slice
{
public:
Slice_arm();
diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h
index fced6398c54..78c540845b0 100644
--- a/src/layer/arm/softmax_arm.h
+++ b/src/layer/arm/softmax_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_arm : virtual public Softmax
+class Softmax_arm : public Softmax
{
public:
Softmax_arm();
diff --git a/src/layer/arm/softmax_arm_asimdhp.cpp b/src/layer/arm/softmax_arm_asimdhp.cpp
index 844e32ce908..3ef14a34acb 100644
--- a/src/layer/arm/softmax_arm_asimdhp.cpp
+++ b/src/layer/arm/softmax_arm_asimdhp.cpp
@@ -255,7 +255,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 32;
maxptr += 4;
@@ -292,7 +292,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
vst1q_f16(ptr, _p0);
vst1q_f16(ptr + 8, _p1);
float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 16;
maxptr += 4;
@@ -743,7 +743,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 32;
sumptr += 4;
@@ -768,7 +768,7 @@ int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
float16x8_t _p1 = vld1q_f16(ptr + 8);
float16x4_t _sum = vld1_f16(sumptr);
float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
- _sum = vadd_f16(_sum, vpmax_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
+ _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
vst1_f16(sumptr, _sum);
ptr += 16;
sumptr += 4;
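
Unlike the surrounding cleanups, this is a correctness fix: the fp16 softmax reduction collapsed its partial sums with vpmax_f16 (pairwise maximum) where the denominator needs a pairwise add, so the affected lanes accumulated only the largest contributions. The corrected horizontal step as intrinsics (a sketch; assumes ARMv8.2 fp16 arithmetic, as the _asimdhp source does):

#include <arm_neon.h>

// collapse two float16x8_t partial sums into four per-position sums,
// matching the fixed vpaddq_f16 / vpadd_f16 sequence in the patch
static inline float16x4_t hsum_step(float16x4_t sum, float16x8_t p0, float16x8_t p1)
{
    float16x8_t ss = vpaddq_f16(p0, p1);
    // was vpmax_f16: a max, not a sum -- wrong softmax denominator
    return vadd_f16(sum, vpadd_f16(vget_low_f16(ss), vget_high_f16(ss)));
}
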
diff --git a/src/layer/arm/swish_arm.h b/src/layer/arm/swish_arm.h
index ac24757c397..907d79708ab 100644
--- a/src/layer/arm/swish_arm.h
+++ b/src/layer/arm/swish_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_arm : virtual public Swish
+class Swish_arm : public Swish
{
public:
Swish_arm();
diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h
index e019b32ec4f..db62f117a56 100644
--- a/src/layer/arm/tanh_arm.h
+++ b/src/layer/arm/tanh_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_arm : virtual public TanH
+class TanH_arm : public TanH
{
public:
TanH_arm();
diff --git a/src/layer/arm/unaryop_arm.h b/src/layer/arm/unaryop_arm.h
index 66994eb2103..ab4b23c05f1 100644
--- a/src/layer/arm/unaryop_arm.h
+++ b/src/layer/arm/unaryop_arm.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_arm : virtual public UnaryOp
+class UnaryOp_arm : public UnaryOp
{
public:
UnaryOp_arm();
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index 4acf91869ae..fe025456f48 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -95,17 +95,9 @@ int Convolution::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int Convolution::create_pipeline(const Option& opt)
-{
- if (dynamic_weight)
- return 0;
-
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
const int maxk = kernel_w * kernel_h;
const int num_input = weight_data_size / num_output / maxk;
@@ -114,7 +106,8 @@ int Convolution::create_pipeline(const Option& opt)
Mat weight_data_int8;
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.blob_allocator = weight_data.allocator;
opt_q.use_packing_layout = false;
quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
@@ -123,8 +116,6 @@ int Convolution::create_pipeline(const Option& opt)
weight_data = weight_data_int8.reshape(weight_data_size);
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
@@ -219,7 +210,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
if (bottom_blob.w * bottom_blob.elempack == num_input)
{
// call InnerProduct
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::InnerProduct);
// set param
ncnn::ParamDict pd;
diff --git a/src/layer/convolution.h b/src/layer/convolution.h
index 476a7aaf67b..7af0735fd30 100644
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -28,8 +28,6 @@ class Convolution : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolution1d.cpp b/src/layer/convolution1d.cpp
index 184b2bdb60d..7d6be1e111e 100644
--- a/src/layer/convolution1d.cpp
+++ b/src/layer/convolution1d.cpp
@@ -67,14 +67,6 @@ int Convolution1D::load_model(const ModelBin& mb)
return 0;
}
-int Convolution1D::create_pipeline(const Option&)
-{
- if (dynamic_weight)
- return 0;
-
- return 0;
-}
-
static int convolution1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int activation_type, const Mat& activation_params, const Option& opt)
{
const int h = bottom_blob.h;
diff --git a/src/layer/convolution1d.h b/src/layer/convolution1d.h
index e30807e5c9b..d87099e25f2 100644
--- a/src/layer/convolution1d.h
+++ b/src/layer/convolution1d.h
@@ -28,8 +28,6 @@ class Convolution1D : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index e820a192cb3..fb8e1e5c0b2 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -124,14 +124,9 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int ConvolutionDepthWise::create_pipeline(const Option& opt)
-{
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
Mat int8_weight_data(weight_data_size, (size_t)1u);
if (int8_weight_data.empty())
@@ -141,7 +136,8 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt)
for (int g = 0; g < group; g++)
{
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.blob_allocator = int8_weight_data.allocator;
opt_q.use_packing_layout = false;
@@ -153,8 +149,6 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt)
weight_data = int8_weight_data;
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h
index e893aa07fc9..8a955dbd23b 100644
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -28,8 +28,6 @@ class ConvolutionDepthWise : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/convolutiondepthwise1d.cpp b/src/layer/convolutiondepthwise1d.cpp
index 79c83168051..2ace80658e7 100644
--- a/src/layer/convolutiondepthwise1d.cpp
+++ b/src/layer/convolutiondepthwise1d.cpp
@@ -59,6 +59,9 @@ int ConvolutionDepthWise1D::load_param(const ParamDict& pd)
int ConvolutionDepthWise1D::load_model(const ModelBin& mb)
{
+ if (dynamic_weight)
+ return 0;
+
weight_data = mb.load(weight_data_size, 0);
if (weight_data.empty())
return -100;
@@ -73,11 +76,6 @@ int ConvolutionDepthWise1D::load_model(const ModelBin& mb)
return 0;
}
-int ConvolutionDepthWise1D::create_pipeline(const Option&)
-{
- return 0;
-}
-
static int convolutiondepthwise1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
const int h = bottom_blob.h;
diff --git a/src/layer/convolutiondepthwise1d.h b/src/layer/convolutiondepthwise1d.h
index e2c195dc489..6026f04981d 100644
--- a/src/layer/convolutiondepthwise1d.h
+++ b/src/layer/convolutiondepthwise1d.h
@@ -28,8 +28,6 @@ class ConvolutionDepthWise1D : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
diff --git a/src/layer/fused_activation.h b/src/layer/fused_activation.h
index 275fd9e2f9a..d5919257792 100644
--- a/src/layer/fused_activation.h
+++ b/src/layer/fused_activation.h
@@ -80,14 +80,14 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
if (activation_type == 1)
{
- activation = ncnn::create_layer(ncnn::LayerType::ReLU);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 2)
{
- activation = ncnn::create_layer(ncnn::LayerType::ReLU);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // slope
@@ -95,7 +95,7 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
}
else if (activation_type == 3)
{
- activation = ncnn::create_layer(ncnn::LayerType::Clip);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Clip);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // min
@@ -105,21 +105,21 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat
}
else if (activation_type == 4)
{
- activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Sigmoid);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 5)
{
- activation = ncnn::create_layer(ncnn::LayerType::Mish);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::Mish);
ncnn::ParamDict pd;
activation->load_param(pd);
}
else if (activation_type == 6)
{
- activation = ncnn::create_layer(ncnn::LayerType::HardSwish);
+ activation = ncnn::create_layer_cpu(ncnn::LayerType::HardSwish);
ncnn::ParamDict pd;
pd.set(0, activation_params[0]); // alpha
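
The switch to create_layer_cpu matters here: fused activations run inline inside a CPU forward path, and plain create_layer now returns the Layer_final wrapper, which would pointlessly probe for a Vulkan variant. Usage is unchanged; for example, activation_type 3 builds a Clip like this (a usage sketch with a hypothetical clamp range):

#include "layer.h"
#include "layer_type.h"

// sketch: what create_activation_layer(3, params, opt) now constructs
static ncnn::Layer* make_clip_activation()
{
    ncnn::Mat activation_params(2);
    activation_params[0] = 0.f; // min (hypothetical)
    activation_params[1] = 6.f; // max (hypothetical)

    ncnn::Layer* activation = ncnn::create_layer_cpu(ncnn::LayerType::Clip);
    ncnn::ParamDict pd;
    pd.set(0, activation_params[0]); // min
    pd.set(1, activation_params[1]); // max
    activation->load_param(pd);
    return activation;
}
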
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index 4cc22981c34..9cb422d21b6 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -69,21 +69,17 @@ int InnerProduct::load_model(const ModelBin& mb)
}
#endif // NCNN_INT8
- return 0;
-}
-
-int InnerProduct::create_pipeline(const Option& opt)
-{
#if NCNN_INT8
// runtime quantize the weight data
- if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
+ if (weight_data.elemsize == (size_t)4u && int8_scale_term)
{
const int num_input = weight_data_size / num_output;
Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
Mat weight_data_int8;
- Option opt_q = opt;
+ Option opt_q;
+ opt_q.num_threads = 1;
opt_q.use_packing_layout = false;
quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
if (weight_data_int8.empty())
@@ -91,8 +87,6 @@ int InnerProduct::create_pipeline(const Option& opt)
weight_data = weight_data_int8.reshape(weight_data_size);
}
-#else
- (void)(opt);
#endif // NCNN_INT8
return 0;
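
After this refactor the reference Convolution, ConvolutionDepthWise, and InnerProduct layers need no create_pipeline override at all: runtime int8 weight quantization happens once, at load time, gated only by int8_scale_term rather than opt.use_int8_inference — load_model receives no Option, hence the fresh single-threaded opt_q. Assembled from the + and context lines above for readability, the InnerProduct::load_model tail now reads:

#if NCNN_INT8
    // runtime quantize the weight data
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int num_input = weight_data_size / num_output;

        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        Mat weight_data_int8;

        Option opt_q;          // fresh Option: load_model has no opt parameter
        opt_q.num_threads = 1; // weights are quantized once, serially
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}
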
diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h
index 1f9b3fdc0a5..becf7b1d01a 100644
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -28,8 +28,6 @@ class InnerProduct : public Layer
virtual int load_model(const ModelBin& mb);
- virtual int create_pipeline(const Option& opt);
-
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
protected:
diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h
index 0a3143cea43..855f959cf00 100644
--- a/src/layer/loongarch/absval_loongarch.h
+++ b/src/layer/loongarch/absval_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_loongarch : virtual public AbsVal
+class AbsVal_loongarch : public AbsVal
{
public:
AbsVal_loongarch();
diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h
index 8b38d5e1f66..fb477a9aedb 100644
--- a/src/layer/loongarch/batchnorm_loongarch.h
+++ b/src/layer/loongarch/batchnorm_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_loongarch : virtual public BatchNorm
+class BatchNorm_loongarch : public BatchNorm
{
public:
BatchNorm_loongarch();
diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h
index f122ffa0dd9..35824997487 100644
--- a/src/layer/loongarch/bias_loongarch.h
+++ b/src/layer/loongarch/bias_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_loongarch : virtual public Bias
+class Bias_loongarch : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h
index bcf9ef5442f..2fc401ad610 100644
--- a/src/layer/loongarch/binaryop_loongarch.h
+++ b/src/layer/loongarch/binaryop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_loongarch : virtual public BinaryOp
+class BinaryOp_loongarch : public BinaryOp
{
public:
BinaryOp_loongarch();
diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h
index 1fe75c687d8..8925f242ed5 100644
--- a/src/layer/loongarch/cast_loongarch.h
+++ b/src/layer/loongarch/cast_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_loongarch : virtual public Cast
+class Cast_loongarch : public Cast
{
public:
Cast_loongarch();
diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h
index 43df62035ff..1ebeee2aeac 100644
--- a/src/layer/loongarch/clip_loongarch.h
+++ b/src/layer/loongarch/clip_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_loongarch : virtual public Clip
+class Clip_loongarch : public Clip
{
public:
Clip_loongarch();
diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h
index 934c85244df..91b32ef2faf 100644
--- a/src/layer/loongarch/concat_loongarch.h
+++ b/src/layer/loongarch/concat_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_loongarch : virtual public Concat
+class Concat_loongarch : public Concat
{
public:
Concat_loongarch();
diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp
index 0b1a11c868f..0917a79f62e 100644
--- a/src/layer/loongarch/convolution1d_loongarch.cpp
+++ b/src/layer/loongarch/convolution1d_loongarch.cpp
@@ -342,7 +342,7 @@ int Convolution1D_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
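
`create_layer` returns whatever implementation the runtime would pick by default; inside an arch-specific forward fallback that must stay on the CPU, `create_layer_cpu` pins the plain CPU implementation instead. A sketch of the fallback pattern used here, assuming ncnn's usual Layer lifecycle (load_param / create_pipeline / forward / destroy_pipeline); the weight-loading step is elided and the blob names are illustrative:

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);

    ncnn::ParamDict pd;
    pd.set(0, _num_output); // same parameter slot as the code above
    op->load_param(pd);

    // ... feed weight_data_flattened / bias_data_flattened through a ModelBin ...
    op->create_pipeline(opt);
    op->forward(bottom_blob_unpacked, top_blob, opt); // illustrative blob names
    op->destroy_pipeline(opt);
    delete op;
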
diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h
index 36393df4568..922fae598f4 100644
--- a/src/layer/loongarch/convolution1d_loongarch.h
+++ b/src/layer/loongarch/convolution1d_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_loongarch : virtual public Convolution1D
+class Convolution1D_loongarch : public Convolution1D
{
public:
Convolution1D_loongarch();
diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp
index 7816d1c66d2..3c5d0c1a424 100644
--- a/src/layer/loongarch/convolution_loongarch.cpp
+++ b/src/layer/loongarch/convolution_loongarch.cpp
@@ -225,10 +225,7 @@ int Convolution_loongarch::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -593,7 +590,7 @@ int Convolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::ve
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -792,10 +789,7 @@ int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
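
The `opt.lightmode` guard removals above follow one rule applied throughout the rest of this patch: once create_pipeline has produced the packed copy that forward() actually consumes, the raw weights are no longer needed, so they are released unconditionally. A schematic of the pattern with a stand-in Mat type, not verbatim ncnn code:

    struct Mat { void release() { /* drop the refcounted storage */ } };
    struct Option { bool lightmode; };

    Mat weight_data;    // raw weights from the model file
    Mat weight_data_tm; // packed/transformed copy used by forward()

    int create_pipeline_sketch(const Option& /*opt*/)
    {
        weight_data_tm = weight_data; // stand-in for the real weight packing
        weight_data.release();        // was: if (opt.lightmode) weight_data.release();
        return 0;
    }
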
diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h
index a84281bf713..7807f43f9f1 100644
--- a/src/layer/loongarch/convolution_loongarch.h
+++ b/src/layer/loongarch/convolution_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_loongarch : virtual public Convolution
+class Convolution_loongarch : public Convolution
{
public:
Convolution_loongarch();
diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
index 4d134cc4a39..0c5050dbce0 100644
--- a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp
@@ -83,10 +83,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -94,10 +91,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -125,7 +119,7 @@ int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -537,7 +531,7 @@ int ConvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -606,16 +600,15 @@ int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option&
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h
index 554fe764304..35cdd8f008d 100644
--- a/src/layer/loongarch/convolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_loongarch : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_loongarch();
diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h
index 0ba460256d6..cfb4ff352ba 100644
--- a/src/layer/loongarch/crop_loongarch.h
+++ b/src/layer/loongarch/crop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_loongarch : virtual public Crop
+class Crop_loongarch : public Crop
{
public:
Crop_loongarch();
diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp
index 2d934bccb06..62b9d872b60 100644
--- a/src/layer/loongarch/deconvolution_loongarch.cpp
+++ b/src/layer/loongarch/deconvolution_loongarch.cpp
@@ -126,10 +126,7 @@ int Deconvolution_loongarch::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -348,7 +345,7 @@ int Deconvolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h
index f67b5d7e4e1..00ddf67e05b 100644
--- a/src/layer/loongarch/deconvolution_loongarch.h
+++ b/src/layer/loongarch/deconvolution_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_loongarch : virtual public Deconvolution
+class Deconvolution_loongarch : public Deconvolution
{
public:
Deconvolution_loongarch();
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
index f4f4d37bf7f..9495a99aae0 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
@@ -82,16 +82,15 @@ int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -119,7 +118,7 @@ int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -476,7 +475,7 @@ int DeconvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blo
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
index b710f07ecf3..87c5351fab4 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_loongarch : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_loongarch();
diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h
index 61a408d5c50..ae7d3fe6479 100644
--- a/src/layer/loongarch/dequantize_loongarch.h
+++ b/src/layer/loongarch/dequantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_loongarch : virtual public Dequantize
+class Dequantize_loongarch : public Dequantize
{
public:
Dequantize_loongarch();
diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h
index 42810050677..f9beff05034 100644
--- a/src/layer/loongarch/dropout_loongarch.h
+++ b/src/layer/loongarch/dropout_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_loongarch : virtual public Dropout
+class Dropout_loongarch : public Dropout
{
public:
Dropout_loongarch();
diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h
index f9715b20cad..f523132bb5f 100644
--- a/src/layer/loongarch/eltwise_loongarch.h
+++ b/src/layer/loongarch/eltwise_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_loongarch : virtual public Eltwise
+class Eltwise_loongarch : public Eltwise
{
public:
Eltwise_loongarch();
diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h
index afd35c701f5..da75fd12f3f 100644
--- a/src/layer/loongarch/flatten_loongarch.h
+++ b/src/layer/loongarch/flatten_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_loongarch : virtual public Flatten
+class Flatten_loongarch : public Flatten
{
public:
Flatten_loongarch();
diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h
index 755ae89ff03..519a4ba9594 100644
--- a/src/layer/loongarch/hardsigmoid_loongarch.h
+++ b/src/layer/loongarch/hardsigmoid_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_loongarch : virtual public HardSigmoid
+class HardSigmoid_loongarch : public HardSigmoid
{
public:
HardSigmoid_loongarch();
diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h
index e9b0821245c..ef69cb05417 100644
--- a/src/layer/loongarch/hardswish_loongarch.h
+++ b/src/layer/loongarch/hardswish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_loongarch : virtual public HardSwish
+class HardSwish_loongarch : public HardSwish
{
public:
HardSwish_loongarch();
diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp
index 34e908fc11a..e6b8eb0936b 100644
--- a/src/layer/loongarch/innerproduct_loongarch.cpp
+++ b/src/layer/loongarch/innerproduct_loongarch.cpp
@@ -37,7 +37,7 @@ InnerProduct_loongarch::InnerProduct_loongarch()
int InnerProduct_loongarch::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -99,10 +99,7 @@ int InnerProduct_loongarch::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -655,10 +652,7 @@ int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1146,10 +1140,7 @@ int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h
index 4d9574ce919..2ae1a1e57e0 100644
--- a/src/layer/loongarch/innerproduct_loongarch.h
+++ b/src/layer/loongarch/innerproduct_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_loongarch : virtual public InnerProduct
+class InnerProduct_loongarch : public InnerProduct
{
public:
InnerProduct_loongarch();
diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h
index 4c0e0f3dc86..f1fa80705d5 100644
--- a/src/layer/loongarch/interp_loongarch.h
+++ b/src/layer/loongarch/interp_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_loongarch : virtual public Interp
+class Interp_loongarch : public Interp
{
public:
Interp_loongarch();
diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h
index 97c6f0520f5..0c796758064 100644
--- a/src/layer/loongarch/mish_loongarch.h
+++ b/src/layer/loongarch/mish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_loongarch : virtual public Mish
+class Mish_loongarch : public Mish
{
public:
Mish_loongarch();
diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h
index 1db215cfee7..476ebd33a87 100644
--- a/src/layer/loongarch/packing_loongarch.h
+++ b/src/layer/loongarch/packing_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_loongarch : virtual public Packing
+class Packing_loongarch : public Packing
{
public:
Packing_loongarch();
diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h
index 137fbc4459e..de416464783 100644
--- a/src/layer/loongarch/padding_loongarch.h
+++ b/src/layer/loongarch/padding_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_loongarch : virtual public Padding
+class Padding_loongarch : public Padding
{
public:
Padding_loongarch();
diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h
index 97e0c9ff2f7..646b10947b3 100644
--- a/src/layer/loongarch/pooling_loongarch.h
+++ b/src/layer/loongarch/pooling_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_loongarch : virtual public Pooling
+class Pooling_loongarch : public Pooling
{
public:
Pooling_loongarch();
diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h
index 97031bb0601..bafd7ac4c68 100644
--- a/src/layer/loongarch/prelu_loongarch.h
+++ b/src/layer/loongarch/prelu_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_loongarch : virtual public PReLU
+class PReLU_loongarch : public PReLU
{
public:
PReLU_loongarch();
diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h
index cae04aab171..dcc0d8e097e 100644
--- a/src/layer/loongarch/quantize_loongarch.h
+++ b/src/layer/loongarch/quantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_loongarch : virtual public Quantize
+class Quantize_loongarch : public Quantize
{
public:
Quantize_loongarch();
diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h
index 445c6e8febc..6ee6684fdb7 100644
--- a/src/layer/loongarch/relu_loongarch.h
+++ b/src/layer/loongarch/relu_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_loongarch : virtual public ReLU
+class ReLU_loongarch : public ReLU
{
public:
ReLU_loongarch();
diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h
index 8175989959e..4afaf9df3d3 100644
--- a/src/layer/loongarch/requantize_loongarch.h
+++ b/src/layer/loongarch/requantize_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_loongarch : virtual public Requantize
+class Requantize_loongarch : public Requantize
{
public:
Requantize_loongarch();
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h
index b15aad235db..02354d2a5a4 100644
--- a/src/layer/loongarch/sigmoid_loongarch.h
+++ b/src/layer/loongarch/sigmoid_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_loongarch : virtual public Sigmoid
+class Sigmoid_loongarch : public Sigmoid
{
public:
Sigmoid_loongarch();
diff --git a/src/layer/loongarch/slice_loongarch.h b/src/layer/loongarch/slice_loongarch.h
index b42138ba418..2f5faed8cbf 100644
--- a/src/layer/loongarch/slice_loongarch.h
+++ b/src/layer/loongarch/slice_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_loongarch : virtual public Slice
+class Slice_loongarch : public Slice
{
public:
Slice_loongarch();
diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h
index 3c8272a6412..baf930fcbd2 100644
--- a/src/layer/loongarch/softmax_loongarch.h
+++ b/src/layer/loongarch/softmax_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_loongarch : virtual public Softmax
+class Softmax_loongarch : public Softmax
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h
index b8d0b80f01e..9b7d2ac851f 100644
--- a/src/layer/loongarch/swish_loongarch.h
+++ b/src/layer/loongarch/swish_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_loongarch : virtual public Swish
+class Swish_loongarch : public Swish
{
public:
Swish_loongarch();
diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h
index ecbab01ec8f..74231eb56b6 100644
--- a/src/layer/loongarch/tanh_loongarch.h
+++ b/src/layer/loongarch/tanh_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_loongarch : virtual public TanH
+class TanH_loongarch : public TanH
{
public:
TanH_loongarch();
diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h
index 8170bec50cf..f4210aeab57 100644
--- a/src/layer/loongarch/unaryop_loongarch.h
+++ b/src/layer/loongarch/unaryop_loongarch.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_loongarch : virtual public UnaryOp
+class UnaryOp_loongarch : public UnaryOp
{
public:
UnaryOp_loongarch();
diff --git a/src/layer/mips/absval_mips.h b/src/layer/mips/absval_mips.h
index c028c312f35..95dca4d596a 100644
--- a/src/layer/mips/absval_mips.h
+++ b/src/layer/mips/absval_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_mips : virtual public AbsVal
+class AbsVal_mips : public AbsVal
{
public:
AbsVal_mips();
diff --git a/src/layer/mips/batchnorm_mips.h b/src/layer/mips/batchnorm_mips.h
index c18902ebad7..6df49407a0e 100644
--- a/src/layer/mips/batchnorm_mips.h
+++ b/src/layer/mips/batchnorm_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_mips : virtual public BatchNorm
+class BatchNorm_mips : public BatchNorm
{
public:
BatchNorm_mips();
diff --git a/src/layer/mips/bias_mips.h b/src/layer/mips/bias_mips.h
index 3757c0b421e..dfef2159b4d 100644
--- a/src/layer/mips/bias_mips.h
+++ b/src/layer/mips/bias_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_mips : virtual public Bias
+class Bias_mips : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/mips/binaryop_mips.h b/src/layer/mips/binaryop_mips.h
index 55d0f2cf363..e682373ba56 100644
--- a/src/layer/mips/binaryop_mips.h
+++ b/src/layer/mips/binaryop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_mips : virtual public BinaryOp
+class BinaryOp_mips : public BinaryOp
{
public:
BinaryOp_mips();
diff --git a/src/layer/mips/cast_mips.h b/src/layer/mips/cast_mips.h
index e37374bda6c..adabee5f888 100644
--- a/src/layer/mips/cast_mips.h
+++ b/src/layer/mips/cast_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_mips : virtual public Cast
+class Cast_mips : public Cast
{
public:
Cast_mips();
diff --git a/src/layer/mips/clip_mips.h b/src/layer/mips/clip_mips.h
index 951888e0562..5db94bc5454 100644
--- a/src/layer/mips/clip_mips.h
+++ b/src/layer/mips/clip_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_mips : virtual public Clip
+class Clip_mips : public Clip
{
public:
Clip_mips();
diff --git a/src/layer/mips/concat_mips.h b/src/layer/mips/concat_mips.h
index 994ca85cf3b..c4ab84f3037 100644
--- a/src/layer/mips/concat_mips.h
+++ b/src/layer/mips/concat_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_mips : virtual public Concat
+class Concat_mips : public Concat
{
public:
Concat_mips();
diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp
index fc61c940687..e9cf211e49b 100644
--- a/src/layer/mips/convolution1d_mips.cpp
+++ b/src/layer/mips/convolution1d_mips.cpp
@@ -342,7 +342,7 @@ int Convolution1D_mips::forward(const std::vector<Mat>& bottom_blobs, std::vecto
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/convolution1d_mips.h b/src/layer/mips/convolution1d_mips.h
index 13e66e4f36c..dcc9bd4de4a 100644
--- a/src/layer/mips/convolution1d_mips.h
+++ b/src/layer/mips/convolution1d_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_mips : virtual public Convolution1D
+class Convolution1D_mips : public Convolution1D
{
public:
Convolution1D_mips();
diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp
index bc547e4a667..af420e61a9a 100644
--- a/src/layer/mips/convolution_mips.cpp
+++ b/src/layer/mips/convolution_mips.cpp
@@ -225,10 +225,7 @@ int Convolution_mips::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -593,7 +590,7 @@ int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -792,10 +789,7 @@ int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/convolution_mips.h b/src/layer/mips/convolution_mips.h
index e8fe54f87a2..8401c6dfd51 100644
--- a/src/layer/mips/convolution_mips.h
+++ b/src/layer/mips/convolution_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_mips : virtual public Convolution
+class Convolution_mips : public Convolution
{
public:
Convolution_mips();
diff --git a/src/layer/mips/convolutiondepthwise_mips.cpp b/src/layer/mips/convolutiondepthwise_mips.cpp
index 991cb07872d..0c9bdca30ce 100644
--- a/src/layer/mips/convolutiondepthwise_mips.cpp
+++ b/src/layer/mips/convolutiondepthwise_mips.cpp
@@ -83,10 +83,7 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -94,10 +91,7 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -125,7 +119,7 @@ int ConvolutionDepthWise_mips::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -537,7 +531,7 @@ int ConvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, std
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -606,16 +600,15 @@ int ConvolutionDepthWise_mips::create_pipeline_int8_mips(const Option& opt)
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/convolutiondepthwise_mips.h b/src/layer/mips/convolutiondepthwise_mips.h
index 9d28009b8a1..24d1650b0c0 100644
--- a/src/layer/mips/convolutiondepthwise_mips.h
+++ b/src/layer/mips/convolutiondepthwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_mips : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_mips : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_mips();
diff --git a/src/layer/mips/crop_mips.h b/src/layer/mips/crop_mips.h
index e61c73a44d1..77c077e7153 100644
--- a/src/layer/mips/crop_mips.h
+++ b/src/layer/mips/crop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_mips : virtual public Crop
+class Crop_mips : public Crop
{
public:
Crop_mips();
diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp
index 506d3072096..208400f532e 100644
--- a/src/layer/mips/deconvolution_mips.cpp
+++ b/src/layer/mips/deconvolution_mips.cpp
@@ -126,10 +126,7 @@ int Deconvolution_mips::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -348,7 +345,7 @@ int Deconvolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vecto
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/deconvolution_mips.h b/src/layer/mips/deconvolution_mips.h
index 218bd812672..b7c0d2e7578 100644
--- a/src/layer/mips/deconvolution_mips.h
+++ b/src/layer/mips/deconvolution_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_mips : virtual public Deconvolution
+class Deconvolution_mips : public Deconvolution
{
public:
Deconvolution_mips();
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp
index 533bf522ad9..e6f5dd43478 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.cpp
+++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp
@@ -82,16 +82,15 @@ int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -119,7 +118,7 @@ int DeconvolutionDepthWise_mips::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -476,7 +475,7 @@ int DeconvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, s
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.h b/src/layer/mips/deconvolutiondepthwise_mips.h
index a033d7c11c3..24e7a481edf 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.h
+++ b/src/layer/mips/deconvolutiondepthwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_mips : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_mips : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_mips();
diff --git a/src/layer/mips/dequantize_mips.h b/src/layer/mips/dequantize_mips.h
index 09623e20d4f..8ae7e542c12 100644
--- a/src/layer/mips/dequantize_mips.h
+++ b/src/layer/mips/dequantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_mips : virtual public Dequantize
+class Dequantize_mips : public Dequantize
{
public:
Dequantize_mips();
diff --git a/src/layer/mips/dropout_mips.h b/src/layer/mips/dropout_mips.h
index a5a4dbebb90..05fa38463d7 100644
--- a/src/layer/mips/dropout_mips.h
+++ b/src/layer/mips/dropout_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_mips : virtual public Dropout
+class Dropout_mips : public Dropout
{
public:
Dropout_mips();
diff --git a/src/layer/mips/eltwise_mips.h b/src/layer/mips/eltwise_mips.h
index 55252ec661d..9b4ac77319f 100644
--- a/src/layer/mips/eltwise_mips.h
+++ b/src/layer/mips/eltwise_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_mips : virtual public Eltwise
+class Eltwise_mips : public Eltwise
{
public:
Eltwise_mips();
diff --git a/src/layer/mips/flatten_mips.h b/src/layer/mips/flatten_mips.h
index 725ceda6431..c9f33225f98 100644
--- a/src/layer/mips/flatten_mips.h
+++ b/src/layer/mips/flatten_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_mips : virtual public Flatten
+class Flatten_mips : public Flatten
{
public:
Flatten_mips();
diff --git a/src/layer/mips/hardsigmoid_mips.h b/src/layer/mips/hardsigmoid_mips.h
index a1ce9986eca..51cab82627f 100644
--- a/src/layer/mips/hardsigmoid_mips.h
+++ b/src/layer/mips/hardsigmoid_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_mips : virtual public HardSigmoid
+class HardSigmoid_mips : public HardSigmoid
{
public:
HardSigmoid_mips();
diff --git a/src/layer/mips/hardswish_mips.h b/src/layer/mips/hardswish_mips.h
index 692cf22eac2..8ace7fe79f5 100644
--- a/src/layer/mips/hardswish_mips.h
+++ b/src/layer/mips/hardswish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_mips : virtual public HardSwish
+class HardSwish_mips : public HardSwish
{
public:
HardSwish_mips();
diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp
index b064a20e522..9d926bfd08d 100644
--- a/src/layer/mips/innerproduct_mips.cpp
+++ b/src/layer/mips/innerproduct_mips.cpp
@@ -37,7 +37,7 @@ InnerProduct_mips::InnerProduct_mips()
int InnerProduct_mips::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -99,10 +99,7 @@ int InnerProduct_mips::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -655,10 +652,7 @@ int InnerProduct_mips::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1146,10 +1140,7 @@ int InnerProduct_mips::create_pipeline_int8_mips(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/mips/innerproduct_mips.h b/src/layer/mips/innerproduct_mips.h
index 59b26c53627..c96db3f93d1 100644
--- a/src/layer/mips/innerproduct_mips.h
+++ b/src/layer/mips/innerproduct_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_mips : virtual public InnerProduct
+class InnerProduct_mips : public InnerProduct
{
public:
InnerProduct_mips();
diff --git a/src/layer/mips/interp_mips.h b/src/layer/mips/interp_mips.h
index c15b4990cde..baff10b4e38 100644
--- a/src/layer/mips/interp_mips.h
+++ b/src/layer/mips/interp_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_mips : virtual public Interp
+class Interp_mips : public Interp
{
public:
Interp_mips();
diff --git a/src/layer/mips/mish_mips.h b/src/layer/mips/mish_mips.h
index 68cc9ff6f0f..33342a4f5d3 100644
--- a/src/layer/mips/mish_mips.h
+++ b/src/layer/mips/mish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_mips : virtual public Mish
+class Mish_mips : public Mish
{
public:
Mish_mips();
diff --git a/src/layer/mips/packing_mips.h b/src/layer/mips/packing_mips.h
index e90536f4908..ccc57f8af7b 100644
--- a/src/layer/mips/packing_mips.h
+++ b/src/layer/mips/packing_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_mips : virtual public Packing
+class Packing_mips : public Packing
{
public:
Packing_mips();
diff --git a/src/layer/mips/padding_mips.h b/src/layer/mips/padding_mips.h
index 3153f3e2b35..6d4ae8c2f70 100644
--- a/src/layer/mips/padding_mips.h
+++ b/src/layer/mips/padding_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_mips : virtual public Padding
+class Padding_mips : public Padding
{
public:
Padding_mips();
diff --git a/src/layer/mips/pooling_mips.h b/src/layer/mips/pooling_mips.h
index dab4038ecca..ec17a06a99c 100644
--- a/src/layer/mips/pooling_mips.h
+++ b/src/layer/mips/pooling_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_mips : virtual public Pooling
+class Pooling_mips : public Pooling
{
public:
Pooling_mips();
diff --git a/src/layer/mips/prelu_mips.h b/src/layer/mips/prelu_mips.h
index 9ef259ce833..6174c2570c3 100644
--- a/src/layer/mips/prelu_mips.h
+++ b/src/layer/mips/prelu_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_mips : virtual public PReLU
+class PReLU_mips : public PReLU
{
public:
PReLU_mips();
diff --git a/src/layer/mips/quantize_mips.h b/src/layer/mips/quantize_mips.h
index 2607e573f5d..220d73af106 100644
--- a/src/layer/mips/quantize_mips.h
+++ b/src/layer/mips/quantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_mips : virtual public Quantize
+class Quantize_mips : public Quantize
{
public:
Quantize_mips();
diff --git a/src/layer/mips/relu_mips.h b/src/layer/mips/relu_mips.h
index 7fdeae828ef..74e55a6be10 100644
--- a/src/layer/mips/relu_mips.h
+++ b/src/layer/mips/relu_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_mips : virtual public ReLU
+class ReLU_mips : public ReLU
{
public:
ReLU_mips();
diff --git a/src/layer/mips/requantize_mips.h b/src/layer/mips/requantize_mips.h
index a9138b9ea72..6ba740895d2 100644
--- a/src/layer/mips/requantize_mips.h
+++ b/src/layer/mips/requantize_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_mips : virtual public Requantize
+class Requantize_mips : public Requantize
{
public:
Requantize_mips();
diff --git a/src/layer/mips/sigmoid_mips.h b/src/layer/mips/sigmoid_mips.h
index 7ba089b3b4c..2bf166e954d 100644
--- a/src/layer/mips/sigmoid_mips.h
+++ b/src/layer/mips/sigmoid_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_mips : virtual public Sigmoid
+class Sigmoid_mips : public Sigmoid
{
public:
Sigmoid_mips();
diff --git a/src/layer/mips/slice_mips.h b/src/layer/mips/slice_mips.h
index 648233f8e6c..73274d867a0 100644
--- a/src/layer/mips/slice_mips.h
+++ b/src/layer/mips/slice_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_mips : virtual public Slice
+class Slice_mips : public Slice
{
public:
Slice_mips();
diff --git a/src/layer/mips/softmax_mips.h b/src/layer/mips/softmax_mips.h
index 06ce5e16284..91437c13f56 100644
--- a/src/layer/mips/softmax_mips.h
+++ b/src/layer/mips/softmax_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_mips : virtual public Softmax
+class Softmax_mips : public Softmax
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/mips/swish_mips.h b/src/layer/mips/swish_mips.h
index 706106d9269..1dc6753a381 100644
--- a/src/layer/mips/swish_mips.h
+++ b/src/layer/mips/swish_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_mips : virtual public Swish
+class Swish_mips : public Swish
{
public:
Swish_mips();
diff --git a/src/layer/mips/tanh_mips.h b/src/layer/mips/tanh_mips.h
index d1310f18310..12e38d07f71 100644
--- a/src/layer/mips/tanh_mips.h
+++ b/src/layer/mips/tanh_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_mips : virtual public TanH
+class TanH_mips : public TanH
{
public:
TanH_mips();
diff --git a/src/layer/mips/unaryop_mips.h b/src/layer/mips/unaryop_mips.h
index 0a6f12bc3e5..800d028bb21 100644
--- a/src/layer/mips/unaryop_mips.h
+++ b/src/layer/mips/unaryop_mips.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_mips : virtual public UnaryOp
+class UnaryOp_mips : public UnaryOp
{
public:
UnaryOp_mips();
diff --git a/src/layer/noop.cpp b/src/layer/noop.cpp
index 68572b0ba28..a8b42f70e83 100644
--- a/src/layer/noop.cpp
+++ b/src/layer/noop.cpp
@@ -20,11 +20,9 @@ namespace ncnn {
Noop::Noop()
{
support_inplace = true;
- support_vulkan = true;
support_packing = true;
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh();
support_bf16_storage = true;
- support_image_storage = true;
}
int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option& /*opt*/) const
@@ -32,16 +30,4 @@ int Noop::forward_inplace(std::vector<Mat>& /*bottom_top_blobs*/, const Option&
return 0;
}
-#if NCNN_VULKAN
-int Noop::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- return 0;
-}
-
-int Noop::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- return 0;
-}
-#endif // NCNN_VULKAN
-
} // namespace ncnn
diff --git a/src/layer/noop.h b/src/layer/noop.h
index 1fb7af35c08..75bbdd1a308 100644
--- a/src/layer/noop.h
+++ b/src/layer/noop.h
@@ -25,11 +25,6 @@ class Noop : public Layer
Noop();
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
-
-#if NCNN_VULKAN
- virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
- virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
-#endif // NCNN_VULKAN
};
} // namespace ncnn
diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp
index 4d4f7fb578b..55648f8eaf1 100644
--- a/src/layer/reduction.cpp
+++ b/src/layer/reduction.cpp
@@ -1064,7 +1064,11 @@ struct post_process_sqrt
{
T operator()(const T& x) const
{
- return static_cast<T>(sqrtf(x));
+ // math optimization will probably generate rsqrt
+ // that produce -inf on sse with subnormal input
+ // flush subnormal input to zero as a workaround
+ // TODO explicit use simd sqrt like unaryop --- nihui
+ return static_cast<T>(sqrtf(x < FLT_MIN ? 0.f : x));
}
};
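
The guard added above exists because, under fast-math style optimization, the compiler may lower sqrtf in this loop to a reciprocal-square-root (rsqrt) approximation, and SSE rsqrt of a subnormal input overflows, surfacing as -inf after refinement. Flushing anything below FLT_MIN to zero sidesteps that. A standalone sketch of the same guard:

    #include <cfloat>
    #include <cmath>
    #include <cstdio>

    // Flush subnormal (and negative) inputs to zero before sqrtf so a
    // compiler-generated rsqrt approximation cannot produce -inf or NaN.
    static float safe_sqrt(float x)
    {
        return sqrtf(x < FLT_MIN ? 0.f : x);
    }

    int main()
    {
        float subnormal = FLT_MIN / 4.f; // a positive subnormal value
        printf("sqrt(%g) -> %g\n", subnormal, safe_sqrt(subnormal)); // prints 0
        return 0;
    }
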
diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h
index 66d33c834a8..0d35c6b61a0 100644
--- a/src/layer/riscv/absval_riscv.h
+++ b/src/layer/riscv/absval_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_riscv : virtual public AbsVal
+class AbsVal_riscv : public AbsVal
{
public:
AbsVal_riscv();
diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h
index e2365fa5fcf..1ed4dc63d0d 100644
--- a/src/layer/riscv/batchnorm_riscv.h
+++ b/src/layer/riscv/batchnorm_riscv.h
@@ -18,7 +18,7 @@
#include "batchnorm.h"
namespace ncnn {
-class BatchNorm_riscv : virtual public BatchNorm
+class BatchNorm_riscv : public BatchNorm
{
public:
BatchNorm_riscv();
diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h
index 0ecd34d685c..afc728b6e68 100644
--- a/src/layer/riscv/binaryop_riscv.h
+++ b/src/layer/riscv/binaryop_riscv.h
@@ -21,7 +21,7 @@
namespace ncnn {
-class BinaryOp_riscv : virtual public BinaryOp
+class BinaryOp_riscv : public BinaryOp
{
public:
BinaryOp_riscv();
diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h
index 4b55159d819..7c6fbb6d4ce 100644
--- a/src/layer/riscv/cast_riscv.h
+++ b/src/layer/riscv/cast_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_riscv : virtual public Cast
+class Cast_riscv : public Cast
{
public:
Cast_riscv();
diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h
index 16a9eb963f9..051995e18d6 100644
--- a/src/layer/riscv/clip_riscv.h
+++ b/src/layer/riscv/clip_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_riscv : virtual public Clip
+class Clip_riscv : public Clip
{
public:
Clip_riscv();
diff --git a/src/layer/riscv/concat_riscv.h b/src/layer/riscv/concat_riscv.h
index eb85d47819d..23029340350 100644
--- a/src/layer/riscv/concat_riscv.h
+++ b/src/layer/riscv/concat_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_riscv : virtual public Concat
+class Concat_riscv : public Concat
{
public:
Concat_riscv();
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index d3d17861d89..6c581a0edeb 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -387,7 +387,7 @@ int Convolution1D_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vect
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -470,6 +470,8 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+ weight_data.release();
+
return 0;
}
diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h
index 2aa4bbe0f41..f0e7f881801 100644
--- a/src/layer/riscv/convolution1d_riscv.h
+++ b/src/layer/riscv/convolution1d_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_riscv : virtual public Convolution1D
+class Convolution1D_riscv : public Convolution1D
{
public:
Convolution1D_riscv();
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index 4c4d57c6a57..be413e5be25 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -237,10 +237,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -677,7 +674,7 @@ int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -837,10 +834,7 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h
index 17bb43ca0e5..a4e008c9dd1 100644
--- a/src/layer/riscv/convolution_riscv.h
+++ b/src/layer/riscv/convolution_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution_riscv : virtual public Convolution
+class Convolution_riscv : public Convolution
{
public:
Convolution_riscv();
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index eb39ac0baa7..d913fe7e1d5 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -104,10 +104,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -115,10 +112,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -146,7 +140,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -613,7 +607,7 @@ int ConvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs, st
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -688,10 +682,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -699,10 +690,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h
index b0152e0b207..f9503975296 100644
--- a/src/layer/riscv/convolutiondepthwise_riscv.h
+++ b/src/layer/riscv/convolutiondepthwise_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_riscv : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_riscv : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_riscv();
diff --git a/src/layer/riscv/crop_riscv.h b/src/layer/riscv/crop_riscv.h
index 86d2c8064e3..404022fafb2 100644
--- a/src/layer/riscv/crop_riscv.h
+++ b/src/layer/riscv/crop_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_riscv : virtual public Crop
+class Crop_riscv : public Crop
{
public:
Crop_riscv();
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 9202d367f93..6b395282908 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -148,10 +148,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
{
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -404,7 +401,7 @@ int Deconvolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vect
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -533,10 +530,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 903a420427a..57d30349aad 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_riscv : virtual public Deconvolution
+class Deconvolution_riscv : public Deconvolution
{
public:
Deconvolution_riscv();
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
index eee765c4ea6..7b567cf63e0 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -97,10 +97,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -108,10 +105,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -139,7 +133,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -531,7 +525,7 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs,
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -625,10 +619,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -636,10 +627,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h
index 5cdbd0d0676..b0c8f7b0119 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.h
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_riscv : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_riscv();
diff --git a/src/layer/riscv/dropout_riscv.h b/src/layer/riscv/dropout_riscv.h
index d685c0ee3b4..9c28d867251 100644
--- a/src/layer/riscv/dropout_riscv.h
+++ b/src/layer/riscv/dropout_riscv.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class Dropout_riscv : virtual public Dropout
+class Dropout_riscv : public Dropout
{
public:
Dropout_riscv();
diff --git a/src/layer/riscv/flatten_riscv.h b/src/layer/riscv/flatten_riscv.h
index 52a290ca678..31860340213 100644
--- a/src/layer/riscv/flatten_riscv.h
+++ b/src/layer/riscv/flatten_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_riscv : virtual public Flatten
+class Flatten_riscv : public Flatten
{
public:
Flatten_riscv();
diff --git a/src/layer/riscv/gelu_riscv.h b/src/layer/riscv/gelu_riscv.h
index fbe522694d1..8a2e9492cc9 100644
--- a/src/layer/riscv/gelu_riscv.h
+++ b/src/layer/riscv/gelu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_riscv : virtual public GELU
+class GELU_riscv : public GELU
{
public:
GELU_riscv();
diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp
index ec5a5cdac41..9b4b58ac651 100644
--- a/src/layer/riscv/gemm_riscv.cpp
+++ b/src/layer/riscv/gemm_riscv.cpp
@@ -99,23 +99,10 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
vfloat32m1_t _r6h = vle32_v_f32m1(p6 + 4, vl);
vfloat32m1_t _r7l = vle32_v_f32m1(p7, vl);
vfloat32m1_t _r7h = vle32_v_f32m1(p7 + 4, vl);
- transpose8x8_ps(_r0l, _r0h, _r1l, _r1h, _r2l, _r2h, _r3l, _r3h, _r4l, _r4h, _r5l, _r5h, _r6l, _r6h, _r7l, _r7h, vl);
- vse32_v_f32m1(pp, _r0l, vl);
- vse32_v_f32m1(pp + 4, _r0h, vl);
- vse32_v_f32m1(pp + 8, _r1l, vl);
- vse32_v_f32m1(pp + 12, _r1h, vl);
- vse32_v_f32m1(pp + 8 * 2, _r2l, vl);
- vse32_v_f32m1(pp + 8 * 2 + 4, _r2h, vl);
- vse32_v_f32m1(pp + 8 * 3, _r3l, vl);
- vse32_v_f32m1(pp + 8 * 3 + 4, _r3h, vl);
- vse32_v_f32m1(pp + 8 * 4, _r4l, vl);
- vse32_v_f32m1(pp + 8 * 4 + 4, _r4h, vl);
- vse32_v_f32m1(pp + 8 * 5, _r5l, vl);
- vse32_v_f32m1(pp + 8 * 5 + 4, _r5h, vl);
- vse32_v_f32m1(pp + 8 * 6, _r6l, vl);
- vse32_v_f32m1(pp + 8 * 6 + 4, _r6h, vl);
- vse32_v_f32m1(pp + 8 * 7, _r7l, vl);
- vse32_v_f32m1(pp + 8 * 7 + 4, _r7h, vl);
+
+ vsseg8e32_v_f32m1(pp, _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(pp + 32, _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
pp += 64;
p0 += 8;
p1 += 8;
@@ -175,7 +162,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
- store_float_v4(v0, v1, v2, v3, pp, vl);
+ vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
pp += 16;
p0 += 4;
p1 += 4;
@@ -210,7 +197,7 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += 4;
p1 += 4;
@@ -353,7 +340,7 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += A_hstep * 4;
}
@@ -562,17 +549,8 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl);
vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl);
- transpose4x4_ps(_r0, _r1, _r2, _r3, vl);
- transpose4x4_ps(_r4, _r5, _r6, _r7, vl);
+ vsseg8e32_v_f32m1(pp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
- vse32_v_f32m1(pp, _r0, vl);
- vse32_v_f32m1(pp + 4, _r4, vl);
- vse32_v_f32m1(pp + 4 * 2, _r1, vl);
- vse32_v_f32m1(pp + 4 * 3, _r5, vl);
- vse32_v_f32m1(pp + 4 * 4, _r2, vl);
- vse32_v_f32m1(pp + 4 * 5, _r6, vl);
- vse32_v_f32m1(pp + 4 * 6, _r3, vl);
- vse32_v_f32m1(pp + 4 * 7, _r7, vl);
pp += 32;
p0 += 4;
p1 += 4;
@@ -632,7 +610,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
vfloat32m1_t v2 = vle32_v_f32m1(p2, vl);
vfloat32m1_t v3 = vle32_v_f32m1(p3, vl);
- store_float_v4(v0, v1, v2, v3, pp, vl);
+ vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl);
pp += 16;
p0 += 4;
p1 += 4;
@@ -667,7 +645,7 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p1, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += 4;
p1 += 4;
@@ -865,7 +843,7 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int
{
vfloat32m1_t v0 = vle32_v_f32m1(p0, vl);
vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl);
- store_float_v2(v0, v1, pp, vl);
+ vsseg2e32_v_f32m1(pp, v0, v1, vl);
pp += 8;
p0 += B_hstep * 4;
}
@@ -937,12 +915,12 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
vfloat32m1_t v1 = vle32_v_f32m1(pp + 8, vl);
vfloat32m1_t v2 = vle32_v_f32m1(pp + 16, vl);
vfloat32m1_t v3 = vle32_v_f32m1(pp + 24, vl);
- store_float_v4(v0, v1, v2, v3, p0, vl);
+ vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
v0 = vle32_v_f32m1(pp + 4, vl);
v1 = vle32_v_f32m1(pp + 12, vl);
v2 = vle32_v_f32m1(pp + 20, vl);
v3 = vle32_v_f32m1(pp + 28, vl);
- store_float_v4(v0, v1, v2, v3, p0 + 16, vl);
+ vsseg4e32_v_f32m1(p0 + 16, v0, v1, v2, v3, vl);
pp += 32;
p0 += out_hstep * 4;
}
@@ -974,7 +952,7 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i,
vfloat32m1_t v1 = vle32_v_f32m1(pp + 4, vl);
vfloat32m1_t v2 = vle32_v_f32m1(pp + 8, vl);
vfloat32m1_t v3 = vle32_v_f32m1(pp + 12, vl);
- store_float_v4(v0, v1, v2, v3, p0, vl);
+ vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl);
pp += 16;
p0 += out_hstep * 4;
}
@@ -2887,9 +2865,9 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum00, _sum10, outptr, vl);
- store_float_v2(_sum01, _sum11, outptr + 8, vl);
- store_float_v2(_sum02, _sum12, outptr + 16, vl);
+ vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+ vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
+ vsseg2e32_v_f32m1(outptr + 16, _sum02, _sum12, vl);
}
outptr += 24;
@@ -2974,8 +2952,8 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum00, _sum10, outptr, vl);
- store_float_v2(_sum01, _sum11, outptr + 8, vl);
+ vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl);
+ vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl);
}
outptr += 16;
@@ -3048,7 +3026,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
}
else
{
- store_float_v2(_sum0, _sum1, outptr, vl);
+ vsseg2e32_v_f32m1(outptr, _sum0, _sum1, vl);
}
outptr += 8;
@@ -4006,10 +3984,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -4049,10 +4024,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -4082,10 +4054,7 @@ int Gemm_riscv::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
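
The gemm changes above replace explicit in-register transposes plus a run of plain vse32 stores with single segmented stores: vssegNe32 writes its N vector operands interleaved in memory (v0[0], v1[0], ..., v0[1], v1[1], ...), which is exactly the transposed tile layout pack_A_tile and pack_B_tile want. A scalar-equivalent sketch of the two-register case; the real code uses the RVV intrinsic and needs a RISC-V vector toolchain:

    // Scalar equivalent of vsseg2e32_v_f32m1(pp, v0, v1, vl): the two vectors
    // are interleaved on store, transposing a 2 x vl tile in one pass.
    void sseg2_scalar(float* pp, const float* v0, const float* v1, int vl)
    {
        for (int i = 0; i < vl; i++)
        {
            pp[2 * i + 0] = v0[i];
            pp[2 * i + 1] = v1[i];
        }
    }
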
diff --git a/src/layer/riscv/gemm_riscv.h b/src/layer/riscv/gemm_riscv.h
index b92add63891..6bca092fb1f 100644
--- a/src/layer/riscv/gemm_riscv.h
+++ b/src/layer/riscv/gemm_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_riscv : virtual public Gemm
+class Gemm_riscv : public Gemm
{
public:
Gemm_riscv();
diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp
index 28afa5081d0..c7e36c1c0fc 100644
--- a/src/layer/riscv/gru_riscv.cpp
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -714,6 +714,10 @@ int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
+
return 0;
}
diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h
index 18c69ab594b..46bb624519f 100644
--- a/src/layer/riscv/gru_riscv.h
+++ b/src/layer/riscv/gru_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GRU_riscv : virtual public GRU
+class GRU_riscv : public GRU
{
public:
GRU_riscv();
diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h
index b876c485b62..3c264b3188e 100644
--- a/src/layer/riscv/hardsigmoid_riscv.h
+++ b/src/layer/riscv/hardsigmoid_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_riscv : virtual public HardSigmoid
+class HardSigmoid_riscv : public HardSigmoid
{
public:
HardSigmoid_riscv();
diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h
index 662cd067024..cfec7916f59 100644
--- a/src/layer/riscv/hardswish_riscv.h
+++ b/src/layer/riscv/hardswish_riscv.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class HardSwish_riscv : virtual public HardSwish
+class HardSwish_riscv : public HardSwish
{
public:
HardSwish_riscv();
diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp
index ac7b3169708..accfc683584 100644
--- a/src/layer/riscv/innerproduct_riscv.cpp
+++ b/src/layer/riscv/innerproduct_riscv.cpp
@@ -40,7 +40,7 @@ InnerProduct_riscv::InnerProduct_riscv()
int InnerProduct_riscv::create_pipeline(const Option& opt)
{
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
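
A note on the create_layer_cpu change here (and the create_layer_vulkan calls in the Vulkan files below): plain create_layer resolves through the full registry and, depending on the build, may hand back an object wired for another backend, whereas a CPU layer's internal Flatten helper must be the plain CPU implementation. A hedged sketch of the helper-layer lifecycle as used in these files:

```cpp
#include "layer.h"      // ncnn::Layer, ncnn::create_layer_cpu, ncnn::ParamDict
#include "layer_type.h" // ncnn::LayerType

// Sketch: create -> load_param -> create_pipeline -> (forward) -> destroy.
static ncnn::Layer* make_flatten_helper(const ncnn::Option& opt)
{
    ncnn::Layer* flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);

    ncnn::ParamDict pd; // Flatten takes no parameters
    flatten->load_param(pd);
    flatten->create_pipeline(opt);
    return flatten; // caller runs forward(), then destroy_pipeline() + delete
}
```
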
@@ -106,10 +106,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
weight_data_tm = weight_data;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -563,10 +560,7 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt)
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h
index 0503ea3d4fa..d3056d5801d 100644
--- a/src/layer/riscv/innerproduct_riscv.h
+++ b/src/layer/riscv/innerproduct_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_riscv : virtual public InnerProduct
+class InnerProduct_riscv : public InnerProduct
{
public:
InnerProduct_riscv();
diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h
index 80583cc2c89..b0d2e9004ac 100644
--- a/src/layer/riscv/instancenorm_riscv.h
+++ b/src/layer/riscv/instancenorm_riscv.h
@@ -18,7 +18,7 @@
#include "instancenorm.h"
namespace ncnn {
-class InstanceNorm_riscv : virtual public InstanceNorm
+class InstanceNorm_riscv : public InstanceNorm
{
public:
InstanceNorm_riscv();
diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h
index 2f6ca89da34..f479223519b 100644
--- a/src/layer/riscv/interp_riscv.h
+++ b/src/layer/riscv/interp_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_riscv : virtual public Interp
+class Interp_riscv : public Interp
{
public:
Interp_riscv();
diff --git a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h
index 5421ebb2791..2e2be1a2b44 100644
--- a/src/layer/riscv/mish_riscv.h
+++ b/src/layer/riscv/mish_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_riscv : virtual public Mish
+class Mish_riscv : public Mish
{
public:
Mish_riscv();
diff --git a/src/layer/riscv/packing_riscv.h b/src/layer/riscv/packing_riscv.h
index 4d556890f3f..097d774993c 100644
--- a/src/layer/riscv/packing_riscv.h
+++ b/src/layer/riscv/packing_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_riscv : virtual public Packing
+class Packing_riscv : public Packing
{
public:
Packing_riscv();
diff --git a/src/layer/riscv/padding_riscv.h b/src/layer/riscv/padding_riscv.h
index c591806fa3e..7642dccae5f 100644
--- a/src/layer/riscv/padding_riscv.h
+++ b/src/layer/riscv/padding_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_riscv : virtual public Padding
+class Padding_riscv : public Padding
{
public:
Padding_riscv();
diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h
index 48d8feb8233..e285b58eb19 100644
--- a/src/layer/riscv/pooling_riscv.h
+++ b/src/layer/riscv/pooling_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_riscv : virtual public Pooling
+class Pooling_riscv : public Pooling
{
public:
Pooling_riscv();
diff --git a/src/layer/riscv/prelu_riscv.h b/src/layer/riscv/prelu_riscv.h
index 23e5b7ee998..70acbc5d250 100644
--- a/src/layer/riscv/prelu_riscv.h
+++ b/src/layer/riscv/prelu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_riscv : virtual public PReLU
+class PReLU_riscv : public PReLU
{
public:
PReLU_riscv();
diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h
index 516f90d3d76..58181b533b8 100644
--- a/src/layer/riscv/relu_riscv.h
+++ b/src/layer/riscv/relu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_riscv : virtual public ReLU
+class ReLU_riscv : public ReLU
{
public:
ReLU_riscv();
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index 938d3ce3998..e2824646f87 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -86,282 +86,6 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr)
return vloxei32_v_f32m8(ptr, bindex, vl);
}
-static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
-{
- float tmp[8][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
-}
-
-static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
-{
- float tmp[4][4];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
-}
-
-static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
- vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
- vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
- vfloat32m1_t& _ral, vfloat32m1_t& _rah,
- vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
-{
- float tmp[8][12];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
- vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
- vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
- vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
- vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
- vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
- vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
- vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
- vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
- _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
- _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
- _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
- _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
- _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
- _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
- _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
- _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
- vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
- vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
- vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
- vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
-{
- float tmp[12][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0m, vl);
- vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1m, vl);
- vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2m, vl);
- vsse32_v_f32m1(&tmp[8][2], sizeof(float) * 8, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3m, vl);
- vsse32_v_f32m1(&tmp[8][3], sizeof(float) * 8, _r3h, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl);
- vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4m, vl);
- vsse32_v_f32m1(&tmp[8][4], sizeof(float) * 8, _r4h, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl);
- vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5m, vl);
- vsse32_v_f32m1(&tmp[8][5], sizeof(float) * 8, _r5h, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl);
- vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6m, vl);
- vsse32_v_f32m1(&tmp[8][6], sizeof(float) * 8, _r6h, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl);
- vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7m, vl);
- vsse32_v_f32m1(&tmp[8][7], sizeof(float) * 8, _r7h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
- _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
- _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
- _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
- _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
- _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
- _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
- _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
- _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
- _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
- _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
- _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
- _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
- _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
-}
-
-static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
-{
- float tmp[4][8];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
-{
- float tmp[4][12];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
- vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
- vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
- vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
- vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
- vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
- vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
- vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
- vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
- float* ptr = (float*)tmp;
- _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
- _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
- _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
- _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
- _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
-}
-
-static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
- vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
- vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
- vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
-{
- float tmp[8][4];
- vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0l, vl);
- vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 4, _r0h, vl);
- vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1l, vl);
- vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 4, _r1h, vl);
- vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2l, vl);
- vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 4, _r2h, vl);
- vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3l, vl);
- vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 4, _r3h, vl);
- float* ptr = (float*)tmp;
- _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
- _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
- _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
- _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
- _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
- _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
- _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
- _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
-}
-
-static inline void store_float_v2(vfloat32m1_t& vector1, vfloat32m1_t& vector2, float* buf, size_t vl)
-{
- vsse32_v_f32m1(buf + 0, sizeof(float) * 2, vector1, vl);
- vsse32_v_f32m1(buf + 1, sizeof(float) * 2, vector2, vl);
-}
-
-static inline void store_float_v4(vfloat32m1_t& vector1, vfloat32m1_t& vector2, vfloat32m1_t& vector3, vfloat32m1_t& vector4, float* buf, size_t vl)
-{
- vsse32_v_f32m1(buf + 0, sizeof(float) * 4, vector1, vl);
- vsse32_v_f32m1(buf + 1, sizeof(float) * 4, vector2, vl);
- vsse32_v_f32m1(buf + 2, sizeof(float) * 4, vector3, vl);
- vsse32_v_f32m1(buf + 3, sizeof(float) * 4, vector4, vl);
-}
-
#if __riscv_zfh
static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr)
{
@@ -675,4 +399,221 @@ static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const f
#endif // __riscv_zfh
#endif // __riscv_vector
+#ifdef __riscv_vector
+
+static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
+{
+ float tmp[64];
+ vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(&tmp[32], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+}
+
+static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
+{
+ float tmp[16];
+ vsseg4e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, vl);
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+}
+
+static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7h,
+ vfloat32m1_t& _r8l, vfloat32m1_t& _r8h,
+ vfloat32m1_t& _r9l, vfloat32m1_t& _r9h,
+ vfloat32m1_t& _ral, vfloat32m1_t& _rah,
+ vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl)
+{
+ float tmp[8][12];
+
+ vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl);
+ vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl);
+ vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl);
+ vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl);
+ vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl);
+ vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl);
+ vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl);
+ vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl);
+ vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl);
+ vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl);
+ vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl);
+ vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl);
+ vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl);
+ vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl);
+ vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl);
+ vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl);
+ vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl);
+ vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl);
+ vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl);
+ vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl);
+ vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl);
+ vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl);
+ vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl);
+ vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 15 * 4, vl);
+ _r8l = vle32_v_f32m1(ptr + 16 * 4, vl);
+ _r8h = vle32_v_f32m1(ptr + 17 * 4, vl);
+ _r9l = vle32_v_f32m1(ptr + 18 * 4, vl);
+ _r9h = vle32_v_f32m1(ptr + 19 * 4, vl);
+ _ral = vle32_v_f32m1(ptr + 20 * 4, vl);
+ _rah = vle32_v_f32m1(ptr + 21 * 4, vl);
+ _rbl = vle32_v_f32m1(ptr + 22 * 4, vl);
+ _rbh = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h,
+ vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h,
+ vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h,
+ vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h,
+ vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
+{
+ float tmp[96];
+ vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl);
+ vsseg8e32_v_f32m1(&tmp[32], _r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m, vl);
+ vsseg8e32_v_f32m1(&tmp[64], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl);
+
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0m = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r1m = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r2m = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _r3m = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 11 * 4, vl);
+ _r4l = vle32_v_f32m1(ptr + 12 * 4, vl);
+ _r4m = vle32_v_f32m1(ptr + 13 * 4, vl);
+ _r4h = vle32_v_f32m1(ptr + 14 * 4, vl);
+ _r5l = vle32_v_f32m1(ptr + 15 * 4, vl);
+ _r5m = vle32_v_f32m1(ptr + 16 * 4, vl);
+ _r5h = vle32_v_f32m1(ptr + 17 * 4, vl);
+ _r6l = vle32_v_f32m1(ptr + 18 * 4, vl);
+ _r6m = vle32_v_f32m1(ptr + 19 * 4, vl);
+ _r6h = vle32_v_f32m1(ptr + 20 * 4, vl);
+ _r7l = vle32_v_f32m1(ptr + 21 * 4, vl);
+ _r7m = vle32_v_f32m1(ptr + 22 * 4, vl);
+ _r7h = vle32_v_f32m1(ptr + 23 * 4, vl);
+}
+
+static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
+{
+ float tmp[32];
+ vsseg8e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl);
+
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+
+static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl)
+{
+ float tmp[4][12];
+ vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl);
+ vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl);
+ vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl);
+ vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl);
+ vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl);
+ vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl);
+ vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl);
+ vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl);
+ vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl);
+ vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl);
+ vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl);
+ vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl);
+ float* ptr = (float*)tmp;
+ _r0 = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r1 = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r2 = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r3 = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r4 = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r5 = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r6 = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r7 = vle32_v_f32m1(ptr + 7 * 4, vl);
+ _r8 = vle32_v_f32m1(ptr + 8 * 4, vl);
+ _r9 = vle32_v_f32m1(ptr + 9 * 4, vl);
+ _ra = vle32_v_f32m1(ptr + 10 * 4, vl);
+ _rb = vle32_v_f32m1(ptr + 11 * 4, vl);
+}
+
+static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
+ vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
+ vfloat32m1_t& _r2l, vfloat32m1_t& _r2h,
+ vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
+{
+ float tmp[32];
+ vsseg4e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, vl);
+ vsseg4e32_v_f32m1(&tmp[16], _r0h, _r1h, _r2h, _r3h, vl);
+ float* ptr = (float*)tmp;
+ _r0l = vle32_v_f32m1(ptr + 0 * 4, vl);
+ _r0h = vle32_v_f32m1(ptr + 1 * 4, vl);
+ _r1l = vle32_v_f32m1(ptr + 2 * 4, vl);
+ _r1h = vle32_v_f32m1(ptr + 3 * 4, vl);
+ _r2l = vle32_v_f32m1(ptr + 4 * 4, vl);
+ _r2h = vle32_v_f32m1(ptr + 5 * 4, vl);
+ _r3l = vle32_v_f32m1(ptr + 6 * 4, vl);
+ _r3h = vle32_v_f32m1(ptr + 7 * 4, vl);
+}
+#endif
+
#endif // RISCV_USABILITY_H
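
The reinstated transpose helpers above lean on segment stores wherever the geometry allows: storing k registers with vsseg{k}e32 interleaves element i of every register into consecutive memory, which is exactly a k-by-vl transpose in row-major order, so contiguous reloads return the transposed rows. A hedged 4x4 demonstration, assuming vl == 4 (one f32m1 register per row):

```cpp
#include <riscv_vector.h>

// With rows r0..r3, vsseg4e32 writes
// tmp = { r0[0],r1[0],r2[0],r3[0], r0[1],r1[1],r2[1],r3[1], ... }
// -- the transpose in row-major order -- so four contiguous reloads
// return the transposed rows.
static void transpose4x4_demo(float m[16]) // m: row-major 4x4 matrix
{
    size_t vl = vsetvl_e32m1(4);
    vfloat32m1_t r0 = vle32_v_f32m1(m + 0, vl);
    vfloat32m1_t r1 = vle32_v_f32m1(m + 4, vl);
    vfloat32m1_t r2 = vle32_v_f32m1(m + 8, vl);
    vfloat32m1_t r3 = vle32_v_f32m1(m + 12, vl);

    float tmp[16];
    vsseg4e32_v_f32m1(tmp, r0, r1, r2, r3, vl); // tmp now holds m transposed

    vse32_v_f32m1(m + 0, vle32_v_f32m1(tmp + 0, vl), vl);
    vse32_v_f32m1(m + 4, vle32_v_f32m1(tmp + 4, vl), vl);
    vse32_v_f32m1(m + 8, vle32_v_f32m1(tmp + 8, vl), vl);
    vse32_v_f32m1(m + 12, vle32_v_f32m1(tmp + 12, vl), vl);
}
```

The 12-row variants (transpose8x12_ps, transpose4x12_ps) keep the strided vsse32 form because the V extension caps segment loads/stores at NFIELDS = 8, so there is no vsseg12 to reach for.
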
diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h
index 34f072788e5..980261a1496 100644
--- a/src/layer/riscv/rvv_mathfun.h
+++ b/src/layer/riscv/rvv_mathfun.h
@@ -512,8 +512,8 @@ _RVV_FLOAT32_FMA_HELPER(1)
\
vfloat32m##LMUL##_t tu = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_ra7, c_erfc_ra6, vl), c_erfc_ra5, vl), c_erfc_ra4, vl), c_erfc_ra3, vl), c_erfc_ra2, vl), c_erfc_ra1, vl), c_erfc_ra0, vl); \
vfloat32m##LMUL##_t tv = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_sa8, c_erfc_sa7, vl), c_erfc_sa6, vl), c_erfc_sa5, vl), c_erfc_sa4, vl), c_erfc_sa3, vl), c_erfc_sa2, vl), c_erfc_sa1, vl); \
- u = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0x1.6db6dap+1f, vl), tu, u, vl); /* u = absx < 0x1.6db6dap+1f ? tu : u;*/ \
- v = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0x1.6db6dap+1f, vl), tv, v, vl); /* v = absx < 0x1.6db6dap+1f ? tv : v;*/ \
+ u = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 2.857143f, vl), tu, u, vl); /* u = absx < 0x1.6db6dap+1f ? tu : u;*/ \
+ v = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 2.857143f, vl), tv, v, vl); /* v = absx < 0x1.6db6dap+1f ? tv : v;*/ \
\
tu = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_pa6, c_erfc_pa5, vl), c_erfc_pa4, vl), c_erfc_pa3, vl), c_erfc_pa2, vl), c_erfc_pa1, vl), c_erfc_pa0, vl); \
tv = vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vvf_f32m##LMUL(t, vfmadd_vff_f32m##LMUL(t, c_erfc_qa6, c_erfc_qa5, vl), c_erfc_qa4, vl), c_erfc_qa3, vl), c_erfc_qa2, vl), c_erfc_qa1, vl); \
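
The literal change in this hunk is a portability fix with a subtle twist: hex-float literals like 0x1.6db6dap+1f are a C99/C++17 feature that some targeted toolchains reject, hence the decimal rewrite, and the in-line comments still carry the old hex form. Note, though, that the 7-digit decimal 2.857143f rounds one ulp above the old constant (2.85714293 vs 2.85714269), nudging the erfc regime-switch point (roughly 1/0.35) by about 2.4e-7, which is harmless for a branch cut but not bit-identical. A quick hedged check:

```cpp
#include <cstdio>
#include <cstdint>
#include <cstring>

int main()
{
    float a = 0x1.6db6dap+1f; // old literal: 2.85714269... (needs C++17/GNU C)
    float b = 2.857143f;      // new literal: 2.85714293... (old + 1 ulp)
    uint32_t ua, ub;
    std::memcpy(&ua, &a, 4);
    std::memcpy(&ub, &b, 4);
    std::printf("%.9g (0x%08x) vs %.9g (0x%08x)\n", a, ua, b, ub);
    return (int)(ub - ua); // 1: the two literals differ by exactly one ulp
}
```
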
@@ -532,28 +532,28 @@ _RVV_FLOAT32_FMA_HELPER(1)
vfloat32m##LMUL##_t q = vfdiv_vv_f32m##LMUL(u, v, vl); \
vfloat32m##LMUL##_t ret = vfmv_v_f_f32m##LMUL(0.f, vl); \
\
- vfloat32m##LMUL##_t z = vreinterpret_v_u32m##LMUL##_f32m##LMUL( vand_vx_u32m##LMUL(vreinterpret_v_f32m##LMUL##_u32m##LMUL(absx), 0xffff'f000, vl)); \
- \
- vfloat32m##LMUL##_t r = vfmul_vv_f32m##LMUL( exp_ps(vfmadd_vvf_f32m##LMUL(vfneg_v_f32m##LMUL(z, vl), z, -0.5625f, vl), vl), exp_ps(vfmadd_vv_f32m##LMUL(vfsub_vv_f32m##LMUL(z, absx, vl), vfadd_vv_f32m##LMUL(z, absx, vl), q, vl), vl), vl); \
- r = vfdiv_vv_f32m##LMUL(r, absx, vl); \
- t = vfrsub_vf_f32m##LMUL(r, 2.f, vl); \
- r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r */ \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 28.f, vl), r, ret, vl); /* abs < 28.f ? r : ret */ \
- \
- r = vfrsub_vf_f32m##LMUL(q, 1.f - c_erfc_erx_f, vl); \
- t = vfadd_vf_f32m##LMUL(q, 1.f + c_erfc_erx_f, vl); \
- r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r*/ \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 1.25f, vl), r, ret, vl); /* absx < 1.25f ? r : ret*/ \
- \
- r = vfrsub_vf_f32m##LMUL(vfmadd_vv_f32m##LMUL(x, q, vfsub_vf_f32m##LMUL(x, 0.5f, vl), vl), .5, vl); \
- ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0.84375f, vl), r, ret, vl); /* absx < 0.84375f ? r : ret*/ \
- \
- ret = vfmerge_vfm_f32m##LMUL(vmflt_vf_f32m##LMUL##_b##MLEN(x, -6.0f, vl), ret, 2.f, vl); /* x< -6.0f ? 2.0f: ret*/ \
- \
- ret = vmerge_vvm_f32m##LMUL(vmfeq_vv_f32m##LMUL##_b##MLEN(x, x, vl), x, ret, vl); /* erfc(NaN) = NaN*/ \
- \
- return ret; \
-}
+ vfloat32m##LMUL##_t z = vreinterpret_v_u32m##LMUL##_f32m##LMUL(vand_vx_u32m##LMUL(vreinterpret_v_f32m##LMUL##_u32m##LMUL(absx), 0xfffff000, vl)); \
+ \
+ vfloat32m##LMUL##_t r = vfmul_vv_f32m##LMUL(exp_ps(vfmadd_vvf_f32m##LMUL(vfneg_v_f32m##LMUL(z, vl), z, -0.5625f, vl), vl), exp_ps(vfmadd_vv_f32m##LMUL(vfsub_vv_f32m##LMUL(z, absx, vl), vfadd_vv_f32m##LMUL(z, absx, vl), q, vl), vl), vl); \
+ r = vfdiv_vv_f32m##LMUL(r, absx, vl); \
+ t = vfrsub_vf_f32m##LMUL(r, 2.f, vl); \
+ r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r */ \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 28.f, vl), r, ret, vl); /* abs < 28.f ? r : ret */ \
+ \
+ r = vfrsub_vf_f32m##LMUL(q, 1.f - c_erfc_erx_f, vl); \
+ t = vfadd_vf_f32m##LMUL(q, 1.f + c_erfc_erx_f, vl); \
+ r = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl), t, r, vl); /* x < 0.f ? t:r*/ \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 1.25f, vl), r, ret, vl); /* absx < 1.25f ? r : ret*/ \
+ \
+ r = vfrsub_vf_f32m##LMUL(vfmadd_vv_f32m##LMUL(x, q, vfsub_vf_f32m##LMUL(x, 0.5f, vl), vl), .5, vl); \
+ ret = vmerge_vvm_f32m##LMUL(vmfge_vf_f32m##LMUL##_b##MLEN(absx, 0.84375f, vl), r, ret, vl); /* absx < 0.84375f ? r : ret*/ \
+ \
+ ret = vfmerge_vfm_f32m##LMUL(vmflt_vf_f32m##LMUL##_b##MLEN(x, -6.0f, vl), ret, 2.f, vl); /* x< -6.0f ? 2.0f: ret*/ \
+ \
+ ret = vmerge_vvm_f32m##LMUL(vmfeq_vv_f32m##LMUL##_b##MLEN(x, x, vl), x, ret, vl); /* erfc(NaN) = NaN*/ \
+ \
+ return ret; \
+ }
_RVV_FLOAT32_ERFC_OP(1, 32)
_RVV_FLOAT32_ERFC_OP(2, 16)
diff --git a/src/layer/riscv/selu_riscv.h b/src/layer/riscv/selu_riscv.h
index 2cd552fb9b8..185b7f5b2c8 100644
--- a/src/layer/riscv/selu_riscv.h
+++ b/src/layer/riscv/selu_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_riscv : virtual public SELU
+class SELU_riscv : public SELU
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h
index 2b4b33b7cbe..8f014e6c4f2 100644
--- a/src/layer/riscv/sigmoid_riscv.h
+++ b/src/layer/riscv/sigmoid_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_riscv : virtual public Sigmoid
+class Sigmoid_riscv : public Sigmoid
{
public:
Sigmoid_riscv();
diff --git a/src/layer/riscv/softmax_riscv.h b/src/layer/riscv/softmax_riscv.h
index bb39b5e3ba8..f93dc3022e1 100644
--- a/src/layer/riscv/softmax_riscv.h
+++ b/src/layer/riscv/softmax_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_riscv : virtual public Softmax
+class Softmax_riscv : public Softmax
{
public:
Softmax_riscv();
diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h
index 00de62fce4c..05d5cbe1cfd 100644
--- a/src/layer/riscv/swish_riscv.h
+++ b/src/layer/riscv/swish_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_riscv : virtual public Swish
+class Swish_riscv : public Swish
{
public:
Swish_riscv();
diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h
index c7038ef4f3e..6fb22ce91f3 100644
--- a/src/layer/riscv/tanh_riscv.h
+++ b/src/layer/riscv/tanh_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_riscv : virtual public TanH
+class TanH_riscv : public TanH
{
public:
TanH_riscv();
diff --git a/src/layer/riscv/unaryop_riscv.h b/src/layer/riscv/unaryop_riscv.h
index 7e4e4fa8bfe..215ad3426a4 100644
--- a/src/layer/riscv/unaryop_riscv.h
+++ b/src/layer/riscv/unaryop_riscv.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_riscv : virtual public UnaryOp
+class UnaryOp_riscv : public UnaryOp
{
public:
UnaryOp_riscv();
diff --git a/src/layer/split.cpp b/src/layer/split.cpp
index f79fce0f15c..996624dfe7a 100644
--- a/src/layer/split.cpp
+++ b/src/layer/split.cpp
@@ -21,11 +21,9 @@ Split::Split()
{
one_blob_only = false;
support_inplace = false;
- support_vulkan = true;
support_packing = true;
support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh();
support_bf16_storage = true;
- support_image_storage = true;
}
int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& /*opt*/) const
@@ -39,28 +37,4 @@ int Split::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_b
return 0;
}
-#if NCNN_VULKAN
-int Split::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- const VkMat& bottom_blob = bottom_blobs[0];
- for (size_t i = 0; i < top_blobs.size(); i++)
- {
- top_blobs[i] = bottom_blob;
- }
-
- return 0;
-}
-
-int Split::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
-{
- const VkImageMat& bottom_blob = bottom_blobs[0];
- for (size_t i = 0; i < top_blobs.size(); i++)
- {
- top_blobs[i] = bottom_blob;
- }
-
- return 0;
-}
-#endif // NCNN_VULKAN
-
} // namespace ncnn
diff --git a/src/layer/split.h b/src/layer/split.h
index 7437866cfc5..53686f82be3 100644
--- a/src/layer/split.h
+++ b/src/layer/split.h
@@ -25,13 +25,6 @@ class Split : public Layer
Split();
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
-
-#if NCNN_VULKAN
- virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
- virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
-#endif // NCNN_VULKAN
-
-public:
};
} // namespace ncnn
diff --git a/src/layer/vulkan/absval_vulkan.h b/src/layer/vulkan/absval_vulkan.h
index d14c2ac5388..9652aac9b16 100644
--- a/src/layer/vulkan/absval_vulkan.h
+++ b/src/layer/vulkan/absval_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class AbsVal_vulkan : virtual public AbsVal
+class AbsVal_vulkan : public AbsVal
{
public:
AbsVal_vulkan();
diff --git a/src/layer/vulkan/batchnorm_vulkan.h b/src/layer/vulkan/batchnorm_vulkan.h
index 783b84b6efb..eedf049167d 100644
--- a/src/layer/vulkan/batchnorm_vulkan.h
+++ b/src/layer/vulkan/batchnorm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_vulkan : virtual public BatchNorm
+class BatchNorm_vulkan : public BatchNorm
{
public:
BatchNorm_vulkan();
diff --git a/src/layer/vulkan/binaryop_vulkan.h b/src/layer/vulkan/binaryop_vulkan.h
index 97ebcacc9f6..1c66186a0c3 100644
--- a/src/layer/vulkan/binaryop_vulkan.h
+++ b/src/layer/vulkan/binaryop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_vulkan : virtual public BinaryOp
+class BinaryOp_vulkan : public BinaryOp
{
public:
BinaryOp_vulkan();
diff --git a/src/layer/vulkan/cast_vulkan.h b/src/layer/vulkan/cast_vulkan.h
index c184c7439ac..47ce3b27920 100644
--- a/src/layer/vulkan/cast_vulkan.h
+++ b/src/layer/vulkan/cast_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_vulkan : virtual public Cast
+class Cast_vulkan : public Cast
{
public:
Cast_vulkan();
diff --git a/src/layer/vulkan/celu_vulkan.h b/src/layer/vulkan/celu_vulkan.h
index b5e25e19b4d..2c03a4b9c98 100644
--- a/src/layer/vulkan/celu_vulkan.h
+++ b/src/layer/vulkan/celu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class CELU_vulkan : virtual public CELU
+class CELU_vulkan : public CELU
{
public:
CELU_vulkan();
diff --git a/src/layer/vulkan/clip_vulkan.h b/src/layer/vulkan/clip_vulkan.h
index ea73eacd050..79e7745f0c4 100644
--- a/src/layer/vulkan/clip_vulkan.h
+++ b/src/layer/vulkan/clip_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_vulkan : virtual public Clip
+class Clip_vulkan : public Clip
{
public:
Clip_vulkan();
diff --git a/src/layer/vulkan/concat_vulkan.h b/src/layer/vulkan/concat_vulkan.h
index 3db05044ea9..109750f3d8d 100644
--- a/src/layer/vulkan/concat_vulkan.h
+++ b/src/layer/vulkan/concat_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_vulkan : virtual public Concat
+class Concat_vulkan : public Concat
{
public:
Concat_vulkan();
diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp
index 53dff49262b..2747012addc 100644
--- a/src/layer/vulkan/convolution1d_vulkan.cpp
+++ b/src/layer/vulkan/convolution1d_vulkan.cpp
@@ -29,15 +29,21 @@ Convolution1D_vulkan::Convolution1D_vulkan()
pipeline_convolution1d = 0;
}
-int Convolution1D_vulkan::create_pipeline(const Option& _opt)
+int Convolution1D_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Convolution1D::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Convolution1D_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const int maxk = kernel_w;
@@ -47,7 +53,7 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt)
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
ncnn::ParamDict pd;
@@ -127,6 +133,9 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt)
pipeline_convolution1d->create(shader_type_index, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
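
Across all the *_vulkan convolution families in this patch, the dynamic_weight bail-out moves from create_pipeline into a new load_param override, and create_pipeline now releases weight_data/bias_data once the pipelines are built. The ordering rationale (hedged): the framework decides whether a layer takes the Vulkan path after load_param returns but before create_pipeline runs, so support flags cleared inside create_pipeline came too late to affect that choice. A minimal self-contained model of the ordering, with hypothetical stand-in types:

```cpp
// Hypothetical stand-ins; real ncnn reads support_vulkan during Net setup.
struct LayerModel
{
    bool support_vulkan = true;
    bool dynamic_weight = false;

    int load_param(int dynamic) // stand-in for ParamDict parsing
    {
        dynamic_weight = (dynamic != 0);
        if (dynamic_weight)
            support_vulkan = false; // now final before path selection
        return 0;
    }
    int create_pipeline() { return 0; } // flipping flags here was too late
};

int main()
{
    LayerModel m;
    m.load_param(/*dynamic=*/1);
    bool use_gpu = m.support_vulkan; // the framework's decision point
    m.create_pipeline();
    return use_gpu ? 1 : 0;
}
```
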
diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h
index 4fb22040daa..28d692ae618 100644
--- a/src/layer/vulkan/convolution1d_vulkan.h
+++ b/src/layer/vulkan/convolution1d_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Convolution1D_vulkan : virtual public Convolution1D
+class Convolution1D_vulkan : public Convolution1D
{
public:
Convolution1D_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index beb0bccb9bf..302ab9085c5 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -43,15 +43,21 @@ Convolution_vulkan::Convolution_vulkan()
reshape_w = 0;
}
-int Convolution_vulkan::create_pipeline(const Option& _opt)
+int Convolution_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Convolution::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Convolution_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -117,7 +123,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
if (kernel_w == 1 && kernel_h == 1)
{
{
- reshape_1x1xw = ncnn::create_layer(ncnn::LayerType::Reshape);
+ reshape_1x1xw = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape);
reshape_1x1xw->vkdev = vkdev;
reshape_1x1xw->bottom_shapes.resize(1);
@@ -136,7 +142,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
}
{
- reshape_w = ncnn::create_layer(ncnn::LayerType::Reshape);
+ reshape_w = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape);
reshape_w->vkdev = vkdev;
reshape_w->bottom_shapes.resize(1);
@@ -157,7 +163,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
@@ -1142,6 +1148,9 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
pipeline_convolution->create(shader_type_index, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h
index 0efa76fec5c..fa4bdbc5350 100644
--- a/src/layer/vulkan/convolution_vulkan.h
+++ b/src/layer/vulkan/convolution_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Convolution_vulkan : virtual public Convolution
+class Convolution_vulkan : public Convolution
{
public:
Convolution_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
index 57069074c96..59eca6a55c6 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -41,15 +41,21 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
pipeline_convolutiondepthwise_group_pack8to1 = 0;
}
-int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
{
+ int ret = ConvolutionDepthWise::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -177,7 +183,7 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
@@ -265,6 +271,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -404,6 +413,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h
index 3689e369c2b..7a6cfe1f640 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class ConvolutionDepthWise_vulkan : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/crop_vulkan.h b/src/layer/vulkan/crop_vulkan.h
index e60b77f5e7c..4480268849a 100644
--- a/src/layer/vulkan/crop_vulkan.h
+++ b/src/layer/vulkan/crop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_vulkan : virtual public Crop
+class Crop_vulkan : public Crop
{
public:
Crop_vulkan();
diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp
index c53aedefc84..66e57db57bf 100644
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -33,15 +33,21 @@ Deconvolution_vulkan::Deconvolution_vulkan()
pipeline_deconvolution_col2im = 0;
}
-int Deconvolution_vulkan::create_pipeline(const Option& _opt)
+int Deconvolution_vulkan::load_param(const ParamDict& pd)
{
+ int ret = Deconvolution::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int Deconvolution_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -109,7 +115,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
}
{
- crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
crop->vkdev = vkdev;
crop->bottom_shapes.resize(1);
@@ -128,7 +134,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
}
{
- output_crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
output_crop->vkdev = vkdev;
output_crop->bottom_shapes.resize(1);
@@ -456,6 +462,9 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolution->set_optimal_local_size_xyz(local_size_xyz);
pipeline_deconvolution->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/deconvolution_vulkan.h b/src/layer/vulkan/deconvolution_vulkan.h
index 578bdc96747..6e18c38d681 100644
--- a/src/layer/vulkan/deconvolution_vulkan.h
+++ b/src/layer/vulkan/deconvolution_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class Deconvolution_vulkan : virtual public Deconvolution
+class Deconvolution_vulkan : public Deconvolution
{
public:
Deconvolution_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
index b24418fa428..a715a4782f4 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -42,15 +42,21 @@ DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
pipeline_deconvolutiondepthwise_group_pack8to1 = 0;
}
-int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
{
+ int ret = DeconvolutionDepthWise::load_param(pd);
+
if (dynamic_weight)
{
support_vulkan = false;
support_image_storage = false;
- return 0;
}
+ return ret;
+}
+
+int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
+{
Option opt = _opt;
const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0];
const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0];
@@ -168,7 +174,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
crop->vkdev = vkdev;
crop->bottom_shapes.resize(1);
@@ -187,7 +193,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
}
{
- output_crop = ncnn::create_layer(ncnn::LayerType::Crop);
+ output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop);
output_crop->vkdev = vkdev;
output_crop->bottom_shapes.resize(1);
@@ -289,6 +295,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -428,6 +437,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations);
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
index bf38f254eb5..5346de8e628 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h
@@ -19,11 +19,13 @@
namespace ncnn {
-class DeconvolutionDepthWise_vulkan : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_vulkan : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_vulkan();
+ virtual int load_param(const ParamDict& pd);
+
virtual int create_pipeline(const Option& opt);
virtual int destroy_pipeline(const Option& opt);
diff --git a/src/layer/vulkan/deepcopy_vulkan.h b/src/layer/vulkan/deepcopy_vulkan.h
index a7a89d17a67..867ff1af454 100644
--- a/src/layer/vulkan/deepcopy_vulkan.h
+++ b/src/layer/vulkan/deepcopy_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeepCopy_vulkan : virtual public DeepCopy
+class DeepCopy_vulkan : public DeepCopy
{
public:
DeepCopy_vulkan();
diff --git a/src/layer/vulkan/dropout_vulkan.h b/src/layer/vulkan/dropout_vulkan.h
index da2e9ad6051..e45159b7659 100644
--- a/src/layer/vulkan/dropout_vulkan.h
+++ b/src/layer/vulkan/dropout_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_vulkan : virtual public Dropout
+class Dropout_vulkan : public Dropout
{
public:
Dropout_vulkan();
diff --git a/src/layer/vulkan/eltwise_vulkan.h b/src/layer/vulkan/eltwise_vulkan.h
index 2516db55dd2..09418657186 100644
--- a/src/layer/vulkan/eltwise_vulkan.h
+++ b/src/layer/vulkan/eltwise_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_vulkan : virtual public Eltwise
+class Eltwise_vulkan : public Eltwise
{
public:
Eltwise_vulkan();
diff --git a/src/layer/vulkan/elu_vulkan.h b/src/layer/vulkan/elu_vulkan.h
index 62da80a00c5..c616c3be1b9 100644
--- a/src/layer/vulkan/elu_vulkan.h
+++ b/src/layer/vulkan/elu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ELU_vulkan : virtual public ELU
+class ELU_vulkan : public ELU
{
public:
ELU_vulkan();
diff --git a/src/layer/vulkan/erf_vulkan.h b/src/layer/vulkan/erf_vulkan.h
index c793c558687..3f2ae5ace64 100644
--- a/src/layer/vulkan/erf_vulkan.h
+++ b/src/layer/vulkan/erf_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Erf_vulkan : virtual public Erf
+class Erf_vulkan : public Erf
{
public:
Erf_vulkan();
diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h
index 510cab1285f..1068ce547c3 100644
--- a/src/layer/vulkan/flatten_vulkan.h
+++ b/src/layer/vulkan/flatten_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_vulkan : virtual public Flatten
+class Flatten_vulkan : public Flatten
{
public:
Flatten_vulkan();
diff --git a/src/layer/vulkan/gelu_vulkan.h b/src/layer/vulkan/gelu_vulkan.h
index 2c04bc40ba1..ced6f07af4d 100644
--- a/src/layer/vulkan/gelu_vulkan.h
+++ b/src/layer/vulkan/gelu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_vulkan : virtual public GELU
+class GELU_vulkan : public GELU
{
public:
GELU_vulkan();
diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp
index ad768c63dd2..f30fa552f11 100644
--- a/src/layer/vulkan/gemm_vulkan.cpp
+++ b/src/layer/vulkan/gemm_vulkan.cpp
@@ -100,6 +100,10 @@ int Gemm_vulkan::create_pipeline(const Option& opt)
pipeline_gemm->create(LayerShaderType::gemm, opt, specializations);
}
+ A_data.release();
+ B_data.release();
+ C_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h
index 4edbc2f5472..d9fa92018e4 100644
--- a/src/layer/vulkan/gemm_vulkan.h
+++ b/src/layer/vulkan/gemm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_vulkan : virtual public Gemm
+class Gemm_vulkan : public Gemm
{
public:
Gemm_vulkan();
diff --git a/src/layer/vulkan/hardsigmoid_vulkan.h b/src/layer/vulkan/hardsigmoid_vulkan.h
index 23ea48e2959..b0902948c7b 100644
--- a/src/layer/vulkan/hardsigmoid_vulkan.h
+++ b/src/layer/vulkan/hardsigmoid_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_vulkan : virtual public HardSigmoid
+class HardSigmoid_vulkan : public HardSigmoid
{
public:
HardSigmoid_vulkan();
diff --git a/src/layer/vulkan/hardswish_vulkan.h b/src/layer/vulkan/hardswish_vulkan.h
index cd5f93f1d76..ab4726877ef 100644
--- a/src/layer/vulkan/hardswish_vulkan.h
+++ b/src/layer/vulkan/hardswish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_vulkan : virtual public HardSwish
+class HardSwish_vulkan : public HardSwish
{
public:
HardSwish_vulkan();
diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp
index 06bf7b56943..ee73d4bb4ac 100644
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -154,6 +154,9 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
@@ -214,7 +217,7 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
}
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_vulkan(ncnn::LayerType::Flatten);
flatten->vkdev = vkdev;
flatten->bottom_shapes.resize(1);
@@ -361,9 +364,15 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
+ weight_data.release();
+ bias_data.release();
+
return 0;
}
diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h
index 4fe138d480f..9002c581c92 100644
--- a/src/layer/vulkan/innerproduct_vulkan.h
+++ b/src/layer/vulkan/innerproduct_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_vulkan : virtual public InnerProduct
+class InnerProduct_vulkan : public InnerProduct
{
public:
InnerProduct_vulkan();
diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h
index 6ff269d9fab..943fff65aee 100644
--- a/src/layer/vulkan/instancenorm_vulkan.h
+++ b/src/layer/vulkan/instancenorm_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InstanceNorm_vulkan : virtual public InstanceNorm
+class InstanceNorm_vulkan : public InstanceNorm
{
public:
InstanceNorm_vulkan();
diff --git a/src/layer/vulkan/interp_vulkan.h b/src/layer/vulkan/interp_vulkan.h
index 94724a78689..5f1752341fe 100644
--- a/src/layer/vulkan/interp_vulkan.h
+++ b/src/layer/vulkan/interp_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_vulkan : virtual public Interp
+class Interp_vulkan : public Interp
{
public:
Interp_vulkan();
diff --git a/src/layer/vulkan/lrn_vulkan.h b/src/layer/vulkan/lrn_vulkan.h
index 30b3f0cee80..ad8cc99348d 100644
--- a/src/layer/vulkan/lrn_vulkan.h
+++ b/src/layer/vulkan/lrn_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_vulkan : virtual public LRN
+class LRN_vulkan : public LRN
{
public:
LRN_vulkan();
diff --git a/src/layer/vulkan/memorydata_vulkan.h b/src/layer/vulkan/memorydata_vulkan.h
index 7ba21283b75..32655abdcae 100644
--- a/src/layer/vulkan/memorydata_vulkan.h
+++ b/src/layer/vulkan/memorydata_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MemoryData_vulkan : virtual public MemoryData
+class MemoryData_vulkan : public MemoryData
{
public:
MemoryData_vulkan();
diff --git a/src/layer/vulkan/mish_vulkan.h b/src/layer/vulkan/mish_vulkan.h
index 762e331bfc6..864884382de 100644
--- a/src/layer/vulkan/mish_vulkan.h
+++ b/src/layer/vulkan/mish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_vulkan : virtual public Mish
+class Mish_vulkan : public Mish
{
public:
Mish_vulkan();
diff --git a/src/layer/vulkan/multiheadattention_vulkan.cpp b/src/layer/vulkan/multiheadattention_vulkan.cpp
index acb28869382..411b81b05e9 100644
--- a/src/layer/vulkan/multiheadattention_vulkan.cpp
+++ b/src/layer/vulkan/multiheadattention_vulkan.cpp
@@ -49,7 +49,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
{
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
q_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
@@ -72,10 +72,13 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = q_bias_data;
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
+
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
k_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -96,10 +99,13 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = k_bias_data;
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
+
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
v_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -120,6 +126,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = v_bias_data;
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
+
+ v_weight_data.release();
+ v_bias_data.release();
}
{
@@ -182,7 +191,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
}
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_vulkan(ncnn::LayerType::Softmax);
qk_softmax->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(0, -1);
@@ -193,7 +202,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
o_gemm->vkdev = vkdev;
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -212,6 +221,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt)
weights[1] = out_bias_data;
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
+
+ out_weight_data.release();
+ out_bias_data.release();
}
return 0;
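
create_layer() resolves a backend through the registry, which no longer hands back an object that is simultaneously the CPU and Vulkan implementation; internal sub-layers of a Vulkan layer must therefore request the Vulkan factory explicitly, hence the switch to create_layer_vulkan() here. The q/k/v/softmax/o blocks above all follow the same construction sequence, sketched below with real ncnn calls (the wrapper function and variable names are illustrative):

#include "layer.h"       // ncnn::Layer, ncnn::create_layer_vulkan
#include "layer_type.h"  // ncnn::LayerType
#include "paramdict.h"   // ncnn::ParamDict
#include "modelbin.h"    // ncnn::ModelBinFromMatArray

// Sketch of one internal sub-layer, as built for q/k/v/o above.
static ncnn::Layer* make_internal_gemm(ncnn::VulkanDevice* vkdev, const ncnn::Mat* weights /* [2] */, const ncnn::Option& opt)
{
    ncnn::Layer* gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
    gemm->vkdev = vkdev;

    ncnn::ParamDict pd;
    pd.set(2, 0); // transA, as in the k/v branches above
    gemm->load_param(pd);

    gemm->load_model(ncnn::ModelBinFromMatArray(weights));
    gemm->create_pipeline(opt);
    return gemm;
}
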
diff --git a/src/layer/vulkan/multiheadattention_vulkan.h b/src/layer/vulkan/multiheadattention_vulkan.h
index 49662db47a2..3b77d96db48 100644
--- a/src/layer/vulkan/multiheadattention_vulkan.h
+++ b/src/layer/vulkan/multiheadattention_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_vulkan : virtual public MultiHeadAttention
+class MultiHeadAttention_vulkan : public MultiHeadAttention
{
public:
MultiHeadAttention_vulkan();
diff --git a/src/layer/vulkan/noop_vulkan.cpp b/src/layer/vulkan/noop_vulkan.cpp
new file mode 100644
index 00000000000..3a59d2613a3
--- /dev/null
+++ b/src/layer/vulkan/noop_vulkan.cpp
@@ -0,0 +1,35 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "noop_vulkan.h"
+
+namespace ncnn {
+
+Noop_vulkan::Noop_vulkan()
+{
+ support_vulkan = true;
+ support_image_storage = true;
+}
+
+int Noop_vulkan::forward_inplace(std::vector<VkMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ return 0;
+}
+
+int Noop_vulkan::forward_inplace(std::vector<VkImageMat>& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/vulkan/noop_vulkan.h b/src/layer/vulkan/noop_vulkan.h
new file mode 100644
index 00000000000..84d05d07a80
--- /dev/null
+++ b/src/layer/vulkan/noop_vulkan.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_NOOP_VULKAN_H
+#define LAYER_NOOP_VULKAN_H
+
+#include "noop.h"
+
+namespace ncnn {
+
+class Noop_vulkan : public Noop
+{
+public:
+ Noop_vulkan();
+
+ using Noop::forward;
+ virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+ virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_NOOP_VULKAN_H
diff --git a/src/layer/vulkan/normalize_vulkan.h b/src/layer/vulkan/normalize_vulkan.h
index ca44828df1a..4ad20cc457f 100644
--- a/src/layer/vulkan/normalize_vulkan.h
+++ b/src/layer/vulkan/normalize_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Normalize_vulkan : virtual public Normalize
+class Normalize_vulkan : public Normalize
{
public:
Normalize_vulkan();
diff --git a/src/layer/vulkan/packing_vulkan.h b/src/layer/vulkan/packing_vulkan.h
index 954698f98dd..fb9d1cd154f 100644
--- a/src/layer/vulkan/packing_vulkan.h
+++ b/src/layer/vulkan/packing_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_vulkan : virtual public Packing
+class Packing_vulkan : public Packing
{
public:
Packing_vulkan();
diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h
index faea7bd9266..bc6a235ea1c 100644
--- a/src/layer/vulkan/padding_vulkan.h
+++ b/src/layer/vulkan/padding_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_vulkan : virtual public Padding
+class Padding_vulkan : public Padding
{
public:
Padding_vulkan();
diff --git a/src/layer/vulkan/permute_vulkan.h b/src/layer/vulkan/permute_vulkan.h
index c9fc6cfdef1..fd073bec245 100644
--- a/src/layer/vulkan/permute_vulkan.h
+++ b/src/layer/vulkan/permute_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Permute_vulkan : virtual public Permute
+class Permute_vulkan : public Permute
{
public:
Permute_vulkan();
diff --git a/src/layer/vulkan/pixelshuffle_vulkan.h b/src/layer/vulkan/pixelshuffle_vulkan.h
index f24e2dd53b1..d0b812f2bb5 100644
--- a/src/layer/vulkan/pixelshuffle_vulkan.h
+++ b/src/layer/vulkan/pixelshuffle_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PixelShuffle_vulkan : virtual public PixelShuffle
+class PixelShuffle_vulkan : public PixelShuffle
{
public:
PixelShuffle_vulkan();
diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp
index eeba214ccac..ee7a9093301 100644
--- a/src/layer/vulkan/pooling_vulkan.cpp
+++ b/src/layer/vulkan/pooling_vulkan.cpp
@@ -128,7 +128,7 @@ int Pooling_vulkan::create_pipeline(const Option& _opt)
}
{
- padding = ncnn::create_layer(ncnn::LayerType::Padding);
+ padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding);
padding->vkdev = vkdev;
padding->bottom_shapes.resize(1);
diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h
index a3529b2708c..a336908d5d7 100644
--- a/src/layer/vulkan/pooling_vulkan.h
+++ b/src/layer/vulkan/pooling_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Pooling_vulkan : virtual public Pooling
+class Pooling_vulkan : public Pooling
{
public:
Pooling_vulkan();
diff --git a/src/layer/vulkan/prelu_vulkan.h b/src/layer/vulkan/prelu_vulkan.h
index a58f7ce00b3..d2bae5eaac6 100644
--- a/src/layer/vulkan/prelu_vulkan.h
+++ b/src/layer/vulkan/prelu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_vulkan : virtual public PReLU
+class PReLU_vulkan : public PReLU
{
public:
PReLU_vulkan();
diff --git a/src/layer/vulkan/priorbox_vulkan.h b/src/layer/vulkan/priorbox_vulkan.h
index 5b11387e0f5..394b12d0fa9 100644
--- a/src/layer/vulkan/priorbox_vulkan.h
+++ b/src/layer/vulkan/priorbox_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PriorBox_vulkan : virtual public PriorBox
+class PriorBox_vulkan : public PriorBox
{
public:
PriorBox_vulkan();
diff --git a/src/layer/vulkan/relu_vulkan.h b/src/layer/vulkan/relu_vulkan.h
index 7ac8fa76ae0..287781fdaa6 100644
--- a/src/layer/vulkan/relu_vulkan.h
+++ b/src/layer/vulkan/relu_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_vulkan : virtual public ReLU
+class ReLU_vulkan : public ReLU
{
public:
ReLU_vulkan();
diff --git a/src/layer/vulkan/reorg_vulkan.h b/src/layer/vulkan/reorg_vulkan.h
index 1be2ade3601..f1565486996 100644
--- a/src/layer/vulkan/reorg_vulkan.h
+++ b/src/layer/vulkan/reorg_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reorg_vulkan : virtual public Reorg
+class Reorg_vulkan : public Reorg
{
public:
Reorg_vulkan();
diff --git a/src/layer/vulkan/reshape_vulkan.cpp b/src/layer/vulkan/reshape_vulkan.cpp
index 567acc6651d..e33efca47cc 100644
--- a/src/layer/vulkan/reshape_vulkan.cpp
+++ b/src/layer/vulkan/reshape_vulkan.cpp
@@ -121,7 +121,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
if (need_permute)
{
{
- permute_wh = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_wh = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_wh->vkdev = vkdev;
permute_wh->bottom_shapes.resize(1);
@@ -137,7 +137,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
permute_wh->create_pipeline(opt);
}
{
- permute_hwc = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_hwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_hwc->vkdev = vkdev;
permute_hwc->bottom_shapes.resize(1);
@@ -153,7 +153,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
permute_hwc->create_pipeline(opt);
}
{
- permute_dhwc = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_dhwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_dhwc->vkdev = vkdev;
permute_dhwc->bottom_shapes.resize(1);
@@ -171,7 +171,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
if (ndim == 2)
{
- permute_hw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_hw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_hw->vkdev = vkdev;
permute_hw->bottom_shapes.resize(1);
@@ -188,7 +188,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
}
if (ndim == 3)
{
- permute_chw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_chw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_chw->vkdev = vkdev;
permute_chw->bottom_shapes.resize(1);
@@ -205,7 +205,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt)
}
if (ndim == 4)
{
- permute_cdhw = ncnn::create_layer(ncnn::LayerType::Permute);
+ permute_cdhw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute);
permute_cdhw->vkdev = vkdev;
permute_cdhw->bottom_shapes.resize(1);
diff --git a/src/layer/vulkan/reshape_vulkan.h b/src/layer/vulkan/reshape_vulkan.h
index 134ae1b9ece..6b408f79940 100644
--- a/src/layer/vulkan/reshape_vulkan.h
+++ b/src/layer/vulkan/reshape_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_vulkan : virtual public Reshape
+class Reshape_vulkan : public Reshape
{
public:
Reshape_vulkan();
diff --git a/src/layer/vulkan/scale_vulkan.h b/src/layer/vulkan/scale_vulkan.h
index 867667e3da3..72851030d2d 100644
--- a/src/layer/vulkan/scale_vulkan.h
+++ b/src/layer/vulkan/scale_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_vulkan : virtual public Scale
+class Scale_vulkan : public Scale
{
public:
Scale_vulkan();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
index 79641acbc40..cf6361e981f 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
@@ -80,10 +80,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
if (bias_term == 1)
{
@@ -93,17 +93,24 @@ void main()
coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
+#endif
}
else
{
- sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
}
const int N = psc(c) / 4;
@@ -201,6 +208,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -210,6 +223,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
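
This shader change, and its siblings in the other three cooperative-matrix convolution shaders below, makes the accumulators fp16 when NCNN_fp16_arithmetic is set: the fp16 bias matrices can be assigned without a conversion, and the final fp32→fp16 convert before coopMatStore disappears, in exchange for accumulating the dot products in half precision. A host-side illustration of that tradeoff, using float and double purely as stand-ins for fp16 and fp32 (the shader types are float16_t and float):

#include <cstdio>

// Stand-in demo: summing many small terms in the narrow type rounds at
// every step; summing wide and converting once rounds only at the end.
int main()
{
    const int n = 1 << 20;
    float narrow_acc = 0.f; // stand-in for an fp16 accumulator
    double wide_acc = 0.0;  // stand-in for an fp32 accumulator
    for (int i = 0; i < n; i++)
    {
        narrow_acc += 1e-3f;
        wide_acc += 1e-3;
    }
    printf("narrow: %.6f  wide-then-convert: %.6f\n", narrow_acc, (float)wide_acc);
    return 0;
}
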
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
index 3c82d995202..6d9d9ce7b88 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
@@ -82,14 +82,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
if (bias_term == 1)
{
@@ -103,6 +103,16 @@ void main()
coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+ sum4 = bias2;
+ sum5 = bias2;
+ sum6 = bias3;
+ sum7 = bias3;
+#else
sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
@@ -111,17 +121,18 @@ void main()
sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
+#endif
}
else
{
- sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
}
const int N = psc(c) / 2;
@@ -247,6 +258,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -264,6 +285,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
index 2c0f57e708c..4ec7f1b3f42 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
@@ -80,10 +80,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
+#endif
if (bias_term == 1)
{
@@ -93,17 +100,31 @@ void main()
coopMatLoadNV(bias0, bias_data, gy, 0, false);
coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+#endif
}
else
{
+#if NCNN_fp16_arithmetic
+ sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
}
const int N = psc(c) / 4;
@@ -201,6 +222,12 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
@@ -210,6 +237,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
index 97322e6ed9e..e3580695f9d 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
@@ -82,6 +82,16 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
@@ -90,6 +100,7 @@ void main()
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
+#endif
if (bias_term == 1)
{
@@ -103,6 +114,16 @@ void main()
coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
+#if NCNN_fp16_arithmetic
+ sum0 = bias0;
+ sum1 = bias0;
+ sum2 = bias1;
+ sum3 = bias1;
+ sum4 = bias2;
+ sum5 = bias2;
+ sum6 = bias3;
+ sum7 = bias3;
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
@@ -111,9 +132,20 @@ void main()
sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+#endif
}
else
{
+#if NCNN_fp16_arithmetic
+ sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
@@ -122,6 +154,7 @@ void main()
sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
}
const int N = psc(c) / 2;
@@ -247,6 +280,16 @@ void main()
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+ coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+ coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+ coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+ coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
@@ -264,6 +307,7 @@ void main()
coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
index c4a494e917a..4482ed18e10 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
@@ -68,10 +68,10 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
const int N = psc(c) / 4;
@@ -168,6 +168,12 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -177,6 +183,7 @@ void main()
coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
index 785c917bbf4..ea5aa316b8c 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
@@ -70,14 +70,14 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
- coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+ coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
const int N = psc(c) / 2;
@@ -202,6 +202,16 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum2, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum3, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum4, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum5, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum6, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+ coopMatStore(sum7, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#else
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -219,6 +229,7 @@ void main()
coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
index bcca39eb615..1cf40a5917d 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
@@ -68,10 +68,17 @@ void main()
const int lxd16 = lx / 16; // 0 1
const int lxm16 = lx % 16; // 0 1 2 3 .... 15
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
const int N = psc(c) / 4;
@@ -168,6 +175,12 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
@@ -177,6 +190,7 @@ void main()
coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
index 35d3b4faba5..bcf46eea78c 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
@@ -70,6 +70,16 @@ void main()
const int lxd8 = lx / 8; // 0 1 2 3
const int lxm8 = lx % 8; // 0 1 2 3 .... 7
+#if NCNN_fp16_arithmetic
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+ fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
@@ -78,6 +88,7 @@ void main()
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
const int N = psc(c) / 2;
@@ -202,6 +213,16 @@ void main()
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
+#if NCNN_fp16_arithmetic
+ coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+ coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+ coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+ coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+ coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+ coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+ coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+ coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
@@ -219,6 +240,7 @@ void main()
coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
barrier();
diff --git a/src/layer/vulkan/shufflechannel_vulkan.h b/src/layer/vulkan/shufflechannel_vulkan.h
index 183e45ddaf7..1cbc706ba02 100644
--- a/src/layer/vulkan/shufflechannel_vulkan.h
+++ b/src/layer/vulkan/shufflechannel_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_vulkan : virtual public ShuffleChannel
+class ShuffleChannel_vulkan : public ShuffleChannel
{
public:
ShuffleChannel_vulkan();
diff --git a/src/layer/vulkan/sigmoid_vulkan.h b/src/layer/vulkan/sigmoid_vulkan.h
index 2d244506f4e..1350f6a47d4 100644
--- a/src/layer/vulkan/sigmoid_vulkan.h
+++ b/src/layer/vulkan/sigmoid_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_vulkan : virtual public Sigmoid
+class Sigmoid_vulkan : public Sigmoid
{
public:
Sigmoid_vulkan();
diff --git a/src/layer/vulkan/slice_vulkan.h b/src/layer/vulkan/slice_vulkan.h
index 53793752baa..92f9ad154b1 100644
--- a/src/layer/vulkan/slice_vulkan.h
+++ b/src/layer/vulkan/slice_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_vulkan : virtual public Slice
+class Slice_vulkan : public Slice
{
public:
Slice_vulkan();
diff --git a/src/layer/vulkan/softmax_vulkan.h b/src/layer/vulkan/softmax_vulkan.h
index 35478d2da24..aeff8d40be3 100644
--- a/src/layer/vulkan/softmax_vulkan.h
+++ b/src/layer/vulkan/softmax_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_vulkan : virtual public Softmax
+class Softmax_vulkan : public Softmax
{
public:
Softmax_vulkan();
diff --git a/src/layer/vulkan/split_vulkan.cpp b/src/layer/vulkan/split_vulkan.cpp
new file mode 100644
index 00000000000..791069cc7d7
--- /dev/null
+++ b/src/layer/vulkan/split_vulkan.cpp
@@ -0,0 +1,47 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "split_vulkan.h"
+
+namespace ncnn {
+
+Split_vulkan::Split_vulkan()
+{
+ support_vulkan = true;
+ support_image_storage = true;
+}
+
+int Split_vulkan::forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ const VkMat& bottom_blob = bottom_blobs[0];
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ top_blobs[i] = bottom_blob;
+ }
+
+ return 0;
+}
+
+int Split_vulkan::forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const
+{
+ const VkImageMat& bottom_blob = bottom_blobs[0];
+ for (size_t i = 0; i < top_blobs.size(); i++)
+ {
+ top_blobs[i] = bottom_blob;
+ }
+
+ return 0;
+}
+
+} // namespace ncnn
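
Both new Split_vulkan::forward overloads are zero-copy: VkMat/VkImageMat assignment shares the underlying GPU buffer by reference count, so every output blob aliases the input and no command is recorded (hence the unused cmd parameter). The same idiom with host Mats, for reference:

#include "mat.h"
#include <vector>

// Sketch: reference-counted assignment makes a split O(1) per output.
void split_sketch(const ncnn::Mat& bottom_blob, std::vector<ncnn::Mat>& top_blobs)
{
    for (size_t i = 0; i < top_blobs.size(); i++)
    {
        top_blobs[i] = bottom_blob; // shares data and bumps the refcount; no copy
    }
}
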
diff --git a/src/layer/vulkan/split_vulkan.h b/src/layer/vulkan/split_vulkan.h
new file mode 100644
index 00000000000..8e1998a3a93
--- /dev/null
+++ b/src/layer/vulkan/split_vulkan.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_SPLIT_VULKAN_H
+#define LAYER_SPLIT_VULKAN_H
+
+#include "split.h"
+
+namespace ncnn {
+
+class Split_vulkan : public Split
+{
+public:
+ Split_vulkan();
+
+ using Split::forward;
+ virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+ virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_SPLIT_VULKAN_H
diff --git a/src/layer/vulkan/swish_vulkan.h b/src/layer/vulkan/swish_vulkan.h
index f8d7c9f7707..a562767cbba 100644
--- a/src/layer/vulkan/swish_vulkan.h
+++ b/src/layer/vulkan/swish_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_vulkan : virtual public Swish
+class Swish_vulkan : public Swish
{
public:
Swish_vulkan();
diff --git a/src/layer/vulkan/tanh_vulkan.h b/src/layer/vulkan/tanh_vulkan.h
index cccb2701483..1926363a0f8 100644
--- a/src/layer/vulkan/tanh_vulkan.h
+++ b/src/layer/vulkan/tanh_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_vulkan : virtual public TanH
+class TanH_vulkan : public TanH
{
public:
TanH_vulkan();
diff --git a/src/layer/vulkan/unaryop_vulkan.h b/src/layer/vulkan/unaryop_vulkan.h
index c1d99873889..bad5377f9b3 100644
--- a/src/layer/vulkan/unaryop_vulkan.h
+++ b/src/layer/vulkan/unaryop_vulkan.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_vulkan : virtual public UnaryOp
+class UnaryOp_vulkan : public UnaryOp
{
public:
UnaryOp_vulkan();
diff --git a/src/layer/x86/batchnorm_x86.h b/src/layer/x86/batchnorm_x86.h
index b991e313c3e..7168332a1b3 100644
--- a/src/layer/x86/batchnorm_x86.h
+++ b/src/layer/x86/batchnorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BatchNorm_x86 : virtual public BatchNorm
+class BatchNorm_x86 : public BatchNorm
{
public:
BatchNorm_x86();
diff --git a/src/layer/x86/bias_x86.h b/src/layer/x86/bias_x86.h
index 39d1bcef492..ab8e30de56d 100644
--- a/src/layer/x86/bias_x86.h
+++ b/src/layer/x86/bias_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Bias_x86 : virtual public Bias
+class Bias_x86 : public Bias
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/binaryop_x86.h b/src/layer/x86/binaryop_x86.h
index 9f3ebb3cac9..cd3ff12a989 100644
--- a/src/layer/x86/binaryop_x86.h
+++ b/src/layer/x86/binaryop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BinaryOp_x86 : virtual public BinaryOp
+class BinaryOp_x86 : public BinaryOp
{
public:
BinaryOp_x86();
diff --git a/src/layer/x86/bnll_x86.h b/src/layer/x86/bnll_x86.h
index ac7536b75bf..b3fad45ca7d 100644
--- a/src/layer/x86/bnll_x86.h
+++ b/src/layer/x86/bnll_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class BNLL_x86 : virtual public BNLL
+class BNLL_x86 : public BNLL
{
public:
BNLL_x86();
diff --git a/src/layer/x86/cast_x86.h b/src/layer/x86/cast_x86.h
index bd1ec503382..45b27a8c6ce 100644
--- a/src/layer/x86/cast_x86.h
+++ b/src/layer/x86/cast_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Cast_x86 : virtual public Cast
+class Cast_x86 : public Cast
{
public:
Cast_x86();
diff --git a/src/layer/x86/clip_x86.h b/src/layer/x86/clip_x86.h
index be026777f08..45a4058e90e 100644
--- a/src/layer/x86/clip_x86.h
+++ b/src/layer/x86/clip_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Clip_x86 : virtual public Clip
+class Clip_x86 : public Clip
{
public:
Clip_x86();
diff --git a/src/layer/x86/concat_x86.h b/src/layer/x86/concat_x86.h
index 054d4b784d9..28ff162dbdc 100644
--- a/src/layer/x86/concat_x86.h
+++ b/src/layer/x86/concat_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Concat_x86 : virtual public Concat
+class Concat_x86 : public Concat
{
public:
Concat_x86();
diff --git a/src/layer/x86/convolution1d_x86.cpp b/src/layer/x86/convolution1d_x86.cpp
index e7df16b8316..905db18b728 100644
--- a/src/layer/x86/convolution1d_x86.cpp
+++ b/src/layer/x86/convolution1d_x86.cpp
@@ -43,6 +43,8 @@ int Convolution1D_x86::create_pipeline(const Option& /*opt*/)
convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);
+ weight_data.release();
+
return 0;
}
@@ -126,7 +128,7 @@ int Convolution1D_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
ncnn::ParamDict pd;
pd.set(0, _num_output);
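
The dynamic-weight path above builds a temporary Convolution1D op that consumes host Mats, so with backend-specific factories it must request the CPU implementation explicitly via create_layer_cpu(). The lifecycle of such a throwaway op, sketched with the param and model loading elided to comments (the wrapper function name is illustrative):

#include "layer.h"
#include "layer_type.h"

// Sketch: a throwaway CPU op for the dynamic-weight path.
static int forward_dynamic_sketch(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt)
{
    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
    // ... pd.set(...), op->load_param(pd), op->load_model(...) as in the diff ...
    op->create_pipeline(opt);
    int ret = op->forward(bottom_blob, top_blob, opt);
    op->destroy_pipeline(opt);
    delete op;
    return ret;
}
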
diff --git a/src/layer/x86/convolution1d_x86.h b/src/layer/x86/convolution1d_x86.h
index ec1782b7063..497b34e5962 100644
--- a/src/layer/x86/convolution1d_x86.h
+++ b/src/layer/x86/convolution1d_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Convolution1D_x86 : virtual public Convolution1D
+class Convolution1D_x86 : public Convolution1D
{
public:
Convolution1D_x86();
diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h
index 8c7b891b0dd..94ea79d4540 100644
--- a/src/layer/x86/convolution_3x3_winograd_int8.h
+++ b/src/layer/x86/convolution_3x3_winograd_int8.h
@@ -3544,10 +3544,10 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
- if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
- if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
- if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
+ _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
+ if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
+ if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
+ if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
}
}
@@ -3653,28 +3653,28 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
- if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
- if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
- if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
+ _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
+ if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
+ if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
+ if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 2 + 1 < w)
{
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 2 + 2 < w)
{
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 2 + 3 < w)
{
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
#endif // __AVX512F__
@@ -4768,12 +4768,12 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
- if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
- if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
- if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
- if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), sizeof(signed char))));
- if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), sizeof(signed char))));
+ _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
+ if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
+ if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
+ if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
+ if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), 1)));
+ if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), 1)));
}
}
@@ -4919,40 +4919,40 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
- if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
- if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
- if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
- if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char))));
- if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char))));
+ _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
+ if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
+ if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
+ if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
+ if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1)));
+ if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 4 + 1 < w)
{
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 4 + 2 < w)
{
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 4 + 3 < w)
{
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
if (tj * 4 + 4 < w)
{
- __m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1), _sindex88);
_r4 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val4_32, 0), _mm256_extracti128_si256(_val4_32, 1)));
}
if (tj * 4 + 5 < w)
{
- __m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1), _sindex88);
_r5 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val5_32, 0), _mm256_extracti128_si256(_val5_32, 1)));
}
#endif // __AVX512F__
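
The `sizeof(signed char)` → `1` swap in all of these gather calls is behavior-preserving: the last argument of the gather intrinsics is a byte scale applied to each index and must be 1, 2, 4, or 8, and sizeof(signed char) is 1 by definition; the literal simply satisfies compilers that want a plain immediate there, which is the presumed motivation. Note that the AVX2 and AVX-512 forms take base pointer and index vector in opposite orders, exactly as used above. A self-contained example of the AVX2 form:

#include <immintrin.h>

// Gather eight 32-bit loads from byte addresses base + i*N (AVX2 form:
// base pointer first, index vector second, byte scale last).
__m256i gather_strided(const signed char* base, int N)
{
    __m256i vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    vindex = _mm256_mullo_epi32(vindex, _mm256_set1_epi32(N));
    return _mm256_i32gather_epi32((const int*)base, vindex, 1); // scale = 1 byte
}
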
diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h
index ef8fe0ab920..e72dd8882dd 100644
--- a/src/layer/x86/convolution_im2col_gemm_int8.h
+++ b/src/layer/x86/convolution_im2col_gemm_int8.h
@@ -4871,7 +4871,6 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M
_sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp1);
_sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp1);
- _sum0 = _sum0;
_sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD);
// 0123 4567 89ab cdef x 0
diff --git a/src/layer/x86/convolution_packed_int8.h b/src/layer/x86/convolution_packed_int8.h
index 6217f8bf5bd..46c03f0ca9b 100644
--- a/src/layer/x86/convolution_packed_int8.h
+++ b/src/layer/x86/convolution_packed_int8.h
@@ -169,22 +169,22 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
- __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), sizeof(signed char)));
- __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), sizeof(signed char)));
- __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), sizeof(signed char)));
- __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), sizeof(signed char)));
- __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), sizeof(signed char)));
- __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), sizeof(signed char)));
- __m128i _w8 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), sizeof(signed char)));
- __m128i _w9 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr9 + k), sizeof(signed char)));
- __m128i _wa = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptra + k), sizeof(signed char)));
- __m128i _wb = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrb + k), sizeof(signed char)));
- __m128i _wc = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrc + k), sizeof(signed char)));
- __m128i _wd = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrd + k), sizeof(signed char)));
- __m128i _we = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptre + k), sizeof(signed char)));
- __m128i _wf = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrf + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
+ __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), 1));
+ __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), 1));
+ __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), 1));
+ __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), 1));
+ __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), 1));
+ __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), 1));
+ __m128i _w8 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), 1));
+ __m128i _w9 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr9 + k), 1));
+ __m128i _wa = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptra + k), 1));
+ __m128i _wb = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrb + k), 1));
+ __m128i _wc = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrc + k), 1));
+ __m128i _wd = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrd + k), 1));
+ __m128i _we = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptre + k), 1));
+ __m128i _wf = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptrf + k), 1));
transpose8x16_epi16(_w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7, _w8, _w9, _wa, _wb, _wc, _wd, _we, _wf);
@@ -231,22 +231,22 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
- __m128i _w1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex, sizeof(signed char)));
- __m128i _w2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr2 + k), _vindex, sizeof(signed char)));
- __m128i _w3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr3 + k), _vindex, sizeof(signed char)));
- __m128i _w4 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex, sizeof(signed char)));
- __m128i _w5 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr5 + k), _vindex, sizeof(signed char)));
- __m128i _w6 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr6 + k), _vindex, sizeof(signed char)));
- __m128i _w7 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr7 + k), _vindex, sizeof(signed char)));
- __m128i _w8 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr8 + k), _vindex, sizeof(signed char)));
- __m128i _w9 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr9 + k), _vindex, sizeof(signed char)));
- __m128i _wa = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptra + k), _vindex, sizeof(signed char)));
- __m128i _wb = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrb + k), _vindex, sizeof(signed char)));
- __m128i _wc = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrc + k), _vindex, sizeof(signed char)));
- __m128i _wd = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrd + k), _vindex, sizeof(signed char)));
- __m128i _we = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptre + k), _vindex, sizeof(signed char)));
- __m128i _wf = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrf + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
+ __m128i _w1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex, 1));
+ __m128i _w2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr2 + k), _vindex, 1));
+ __m128i _w3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr3 + k), _vindex, 1));
+ __m128i _w4 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex, 1));
+ __m128i _w5 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr5 + k), _vindex, 1));
+ __m128i _w6 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr6 + k), _vindex, 1));
+ __m128i _w7 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr7 + k), _vindex, 1));
+ __m128i _w8 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr8 + k), _vindex, 1));
+ __m128i _w9 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr9 + k), _vindex, 1));
+ __m128i _wa = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptra + k), _vindex, 1));
+ __m128i _wb = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrb + k), _vindex, 1));
+ __m128i _wc = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrc + k), _vindex, 1));
+ __m128i _wd = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrd + k), _vindex, 1));
+ __m128i _we = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptre + k), _vindex, 1));
+ __m128i _wf = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptrf + k), _vindex, 1));
__m128i _w08 = _mm_unpacklo_epi64(_w0, _w8);
__m128i _w19 = _mm_unpacklo_epi64(_w1, _w9);
@@ -296,8 +296,8 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
__m512i _vindex = _mm512_inserti64x4(_mm512_castsi256_si512(_vindex01), _vindex23, 1);
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr8 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
_mm_storeu_si128((__m128i*)(g00 + 16), _w1);
@@ -313,7 +313,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(inch * maxk));
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
g00 += 16;
}
@@ -346,14 +346,14 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
- __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), sizeof(signed char)));
- __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), sizeof(signed char)));
- __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), sizeof(signed char)));
- __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), sizeof(signed char)));
- __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), sizeof(signed char)));
- __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
+ __m128i _w2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr2 + k), 1));
+ __m128i _w3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr3 + k), 1));
+ __m128i _w4 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr4 + k), 1));
+ __m128i _w5 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr5 + k), 1));
+ __m128i _w6 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr6 + k), 1));
+ __m128i _w7 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr7 + k), 1));
transpose8x8_epi16(_w0, _w1, _w2, _w3, _w4, _w5, _w6, _w7);
@@ -446,10 +446,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
#else
- __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)), _sindex88);
- __m256i _w23 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex01, sizeof(signed char)), _sindex88);
+ __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1), _sindex88);
+ __m256i _w23 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr4 + k), _vindex01, 1), _sindex88);
__m128i _w01xx = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w01, 0), _mm256_extracti128_si256(_w01, 1));
__m128i _w23xx = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w23, 0), _mm256_extracti128_si256(_w23, 1));
__m128i _w0 = _mm_unpacklo_epi64(_w01xx, _w23xx);
@@ -471,7 +471,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
#endif
for (int k = 0; k < maxk; k++)
{
- __m256i _w32 = _mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char));
+ __m256i _w32 = _mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1);
#if __AVX512F__
__m128i _w0 = _mm256_cvtepi32_epi8(_w32);
#else
@@ -583,10 +583,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
#elif __AVX2__
- __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, sizeof(signed char)), _sindex88);
+ __m256i _w01 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex01, 1), _sindex88);
__m128i _w0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w01, 0), _mm256_extracti128_si256(_w01, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
#else
@@ -624,10 +624,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#elif __AVX2__
- __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)), _sindex8);
+ __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1), _sindex8);
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#else
const signed char* k0 = kptr0 + k;
@@ -670,8 +670,8 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
- __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
+ __m128i _w1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr1 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
_mm_storeu_si128((__m128i*)(g00 + 16), _w1);
@@ -699,11 +699,11 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr0 + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
#elif __AVX2__
- __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex0, sizeof(signed char)), _sindex88);
- __m256i _w11 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex0, sizeof(signed char)), _sindex88);
+ __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr0 + k), _vindex0, 1), _sindex88);
+ __m256i _w11 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr1 + k), _vindex0, 1), _sindex88);
__m128i _w0x = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w00, 0), _mm256_extracti128_si256(_w00, 1));
__m128i _w1x = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w11, 0), _mm256_extracti128_si256(_w11, 1));
__m128i _w0 = _mm_unpacklo_epi64(_w0x, _w1x);
@@ -748,10 +748,10 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm_cvtepi32_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1));
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#elif __AVX2__
- __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, sizeof(signed char)), _sindex8);
+ __m128i _w0 = _mm_shuffle_epi8(_mm_i32gather_epi32((const int*)(kptr0 + k), _vindex, 1), _sindex8);
_mm_store_ss((float*)g00, _mm_castsi128_ps(_w0));
#else
const signed char* k0 = kptr0 + k;
@@ -805,7 +805,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
- __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr + k), sizeof(signed char)));
+ __m128i _w0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(kptr + k), 1));
_mm_storeu_si128((__m128i*)g00, _w0);
g00 += 16;
@@ -827,12 +827,12 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker
for (int k = 0; k < maxk; k++)
{
#if __AVX512F__
- __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, sizeof(signed char)));
+ __m128i _w0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
g00 += 8;
#elif __AVX2__
- __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, sizeof(signed char)), _sindex88);
+ __m256i _w00 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(kptr + k), _vindex, 1), _sindex88);
__m128i _w0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_w00, 0), _mm256_extracti128_si256(_w00, 1));
_mm_storel_epi64((__m128i*)g00, _w0);
@@ -1029,10 +1029,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1163,10 +1163,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1394,8 +1394,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1480,8 +1480,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1649,7 +1649,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -1711,7 +1711,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
}
_r0 = _mm_cvtepi8_epi16(_r0);
@@ -1910,10 +1910,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2028,17 +2028,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -2317,8 +2317,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2399,13 +2399,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -2601,7 +2601,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2666,11 +2666,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -2882,10 +2882,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -2997,17 +2997,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -3337,8 +3337,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -3422,13 +3422,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -3669,7 +3669,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _rr0 = _mm256_cvtepi8_epi16(_r0);
@@ -3733,11 +3733,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -3986,10 +3986,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4077,17 +4077,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -4324,8 +4324,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4390,13 +4390,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif // __AVX512F__
@@ -4562,7 +4562,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _val = _mm256_cvtepi8_epi16(_r0);
@@ -4612,11 +4612,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
@@ -4790,10 +4790,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
- _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), sizeof(signed char)));
- _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
+ _r2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r2s), 1));
+ _r3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r3s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -4861,18 +4861,18 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
- _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)));
- _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
+ _r2 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1));
+ _r3 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
+ __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r2s, _vindex, 1), _sindex88);
+ __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r3s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
_r2 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1));
@@ -5071,8 +5071,8 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
- _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
+ _r1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r1s), 1));
}
__m256i _val0 = _mm256_cvtepi8_epi16(_r0);
@@ -5124,14 +5124,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
- _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
+ _r1 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
- __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
+ __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r1s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1));
_r1 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1));
#endif
@@ -5264,7 +5264,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
- _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), sizeof(signed char)));
+ _r0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0s), 1));
}
__m256i _val = _mm256_cvtepi8_epi16(_r0);
@@ -5306,11 +5306,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
- _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)));
+ _r0 = _mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
- __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, sizeof(signed char)), _sindex88);
+ __m256i _val32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0s, _vindex, 1), _sindex88);
_r0 = _mm_unpacklo_epi32(_mm256_extracti128_si256(_val32, 0), _mm256_extracti128_si256(_val32, 1));
#endif // __AVX512F__
#else
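// Why every `sizeof(signed char)` above became a literal `1` (a sketch of the
// reasoning; the motive is our assumption, not stated in the patch): the last
// argument of the x86 gather intrinsics is a byte scale that must be a
// compile-time constant 1, 2, 4 or 8. sizeof(signed char) is guaranteed to be
// 1, so codegen is identical; the literal simply makes the byte granularity
// explicit. Note the argument-order quirk the hunks preserve: AVX-512 takes
// (vindex, base, scale) while AVX2 takes (base, vindex, scale).
#include <immintrin.h>

#if defined(__AVX512F__)
// Illustrative helper (the name is ours, not ncnn's): gather 16 int8 values
// laid out N bytes apart, mirroring the _r0.._r3 loads above. Each lane reads
// a 32-bit word at base + i*N*1; the narrowing cvt keeps the low byte per lane.
static inline __m128i gather16_int8_strided(const signed char* base, int N)
{
    __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
    return _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)base, 1));
}
#endif // __AVX512F__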
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 6e828ff0d21..c1f354ea6de 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -297,7 +297,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
{
- convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
+ convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
// set param
ncnn::ParamDict pd;
@@ -334,10 +334,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
convolution_dilation1->create_pipeline(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -454,10 +451,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -469,7 +463,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -548,10 +542,7 @@ int Convolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -1182,7 +1173,7 @@ int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ ... @@ int ConvolutionDepthWise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
@@ -849,16 +843,15 @@ int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
weight_data_tm = weight_data;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
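// The two recurring edits in the .cpp hunks above and below (motives are
// inferred, not stated in the patch):
//  - ncnn::create_layer() -> ncnn::create_layer_cpu(): helper ops built
//    inside create_pipeline()/forward() (the grouped and dynamic-weight
//    fallbacks) should always run the CPU implementation, so they are created
//    through the CPU-only factory instead of the generic one, which can hand
//    back a GPU-capable layer when Vulkan is enabled.
//  - `if (opt.lightmode) { weight_data.release(); }` -> unconditional
//    weight_data.release(): once the transformed kernel (weight_data_tm or
//    the per-group ops) has been built, the raw weights are no longer needed,
//    so they are freed regardless of lightmode.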
diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h
index 6fe066e5bed..1fedb119bd3 100644
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ConvolutionDepthWise_x86 : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
{
public:
ConvolutionDepthWise_x86();
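// Sketch of why `virtual public` can become plain `public` in all these
// per-arch headers (our reading; the patch itself gives no rationale).
// Virtual inheritance was only needed while a generated combined class
// reached the same base along two paths, roughly:
//
//   class ConvolutionDepthWise_x86    : virtual public ConvolutionDepthWise { };
//   class ConvolutionDepthWise_vulkan : virtual public ConvolutionDepthWise { };
//   class ConvolutionDepthWise_final  : public ConvolutionDepthWise_x86,
//                                       public ConvolutionDepthWise_vulkan { };
//
// With CPU and GPU layer creation split (see create_layer_cpu above), each
// arch-specific class has a single inheritance path, and dropping the virtual
// base removes its extra indirection.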
diff --git a/src/layer/x86/crop_x86.h b/src/layer/x86/crop_x86.h
index e7e3d140fc5..ba0fc1b607e 100644
--- a/src/layer/x86/crop_x86.h
+++ b/src/layer/x86/crop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Crop_x86 : virtual public Crop
+class Crop_x86 : public Crop
{
public:
Crop_x86();
diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp
index 46bdca2a397..6a94104a43d 100644
--- a/src/layer/x86/deconvolution_x86.cpp
+++ b/src/layer/x86/deconvolution_x86.cpp
@@ -94,7 +94,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
@@ -193,10 +193,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -694,7 +691,7 @@ int Deconvolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/x86/deconvolution_x86.h b/src/layer/x86/deconvolution_x86.h
index 4951870bcd0..66c23eef3f3 100644
--- a/src/layer/x86/deconvolution_x86.h
+++ b/src/layer/x86/deconvolution_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Deconvolution_x86 : virtual public Deconvolution
+class Deconvolution_x86 : public Deconvolution
{
public:
Deconvolution_x86();
diff --git a/src/layer/x86/deconvolutiondepthwise_x86.cpp b/src/layer/x86/deconvolutiondepthwise_x86.cpp
index 43a573a64ef..4a1e89d26a8 100644
--- a/src/layer/x86/deconvolutiondepthwise_x86.cpp
+++ b/src/layer/x86/deconvolutiondepthwise_x86.cpp
@@ -109,16 +109,15 @@ int DeconvolutionDepthWise_x86::create_pipeline(const Option& opt)
weight_data_tm = weight_data_transposed;
}
+ weight_data.release();
+
return 0;
}
// group convolution
create_group_ops(opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -146,7 +145,7 @@ int DeconvolutionDepthWise_x86::create_group_ops(const Option& opt)
if (bias_term)
bias_data_g = bias_data.range(num_output_g * g, num_output_g);
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);
// set param
ncnn::ParamDict pd;
@@ -641,7 +640,7 @@ int DeconvolutionDepthWise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
bias_data_flattened.elempack = 1;
}
- ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+ ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
ncnn::ParamDict pd;
pd.set(0, _num_output);
diff --git a/src/layer/x86/deconvolutiondepthwise_x86.h b/src/layer/x86/deconvolutiondepthwise_x86.h
index 07fb5e54f9b..9c9e54cccf4 100644
--- a/src/layer/x86/deconvolutiondepthwise_x86.h
+++ b/src/layer/x86/deconvolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeconvolutionDepthWise_x86 : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_x86 : public DeconvolutionDepthWise
{
public:
DeconvolutionDepthWise_x86();
diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp
index 076e56f7e64..8fc7bdf2855 100644
--- a/src/layer/x86/deformableconv2d_x86.cpp
+++ b/src/layer/x86/deformableconv2d_x86.cpp
@@ -134,7 +134,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt)
{
const int maxk = kernel_w * kernel_h;
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
@@ -203,10 +203,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt)
deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h
index e5ab4e08c99..66cce21ab49 100644
--- a/src/layer/x86/deformableconv2d_x86.h
+++ b/src/layer/x86/deformableconv2d_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class DeformableConv2D_x86 : virtual public DeformableConv2D
+class DeformableConv2D_x86 : public DeformableConv2D
{
public:
DeformableConv2D_x86();
diff --git a/src/layer/x86/dequantize_x86.h b/src/layer/x86/dequantize_x86.h
index 2d8a6a22b0a..52bfcaed22e 100644
--- a/src/layer/x86/dequantize_x86.h
+++ b/src/layer/x86/dequantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dequantize_x86 : virtual public Dequantize
+class Dequantize_x86 : public Dequantize
{
public:
Dequantize_x86();
diff --git a/src/layer/x86/dropout_x86.h b/src/layer/x86/dropout_x86.h
index 959c9889e34..d44a8987162 100644
--- a/src/layer/x86/dropout_x86.h
+++ b/src/layer/x86/dropout_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Dropout_x86 : virtual public Dropout
+class Dropout_x86 : public Dropout
{
public:
Dropout_x86();
diff --git a/src/layer/x86/eltwise_x86.h b/src/layer/x86/eltwise_x86.h
index 0f4eac064e0..e941817a303 100644
--- a/src/layer/x86/eltwise_x86.h
+++ b/src/layer/x86/eltwise_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Eltwise_x86 : virtual public Eltwise
+class Eltwise_x86 : public Eltwise
{
public:
Eltwise_x86();
diff --git a/src/layer/x86/elu_x86.h b/src/layer/x86/elu_x86.h
index cd49c4f7d5a..6da00490d21 100644
--- a/src/layer/x86/elu_x86.h
+++ b/src/layer/x86/elu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ELU_x86 : virtual public ELU
+class ELU_x86 : public ELU
{
public:
ELU_x86();
diff --git a/src/layer/x86/flatten_x86.h b/src/layer/x86/flatten_x86.h
index fcd512ae194..29820121695 100644
--- a/src/layer/x86/flatten_x86.h
+++ b/src/layer/x86/flatten_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Flatten_x86 : virtual public Flatten
+class Flatten_x86 : public Flatten
{
public:
Flatten_x86();
diff --git a/src/layer/x86/gelu_x86.h b/src/layer/x86/gelu_x86.h
index 75d821bfd45..ba4b43e65ec 100644
--- a/src/layer/x86/gelu_x86.h
+++ b/src/layer/x86/gelu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GELU_x86 : virtual public GELU
+class GELU_x86 : public GELU
{
public:
GELU_x86();
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
index 19cd7ebc09a..4ab37836a43 100644
--- a/src/layer/x86/gemm_x86.cpp
+++ b/src/layer/x86/gemm_x86.cpp
@@ -7235,10 +7235,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- A_data.release();
- }
+ A_data.release();
}
if (constantB)
@@ -7282,10 +7279,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- B_data.release();
- }
+ B_data.release();
}
if (constantC && constant_broadcast_type_C != -1)
@@ -7321,10 +7315,7 @@ int Gemm_x86::create_pipeline(const Option& opt)
CT_data = C2;
}
- if (opt.lightmode)
- {
- C_data.release();
- }
+ C_data.release();
}
if (constantA || constantB || constantC)
diff --git a/src/layer/x86/gemm_x86.h b/src/layer/x86/gemm_x86.h
index ef14872d76e..6f8eb4a82bf 100644
--- a/src/layer/x86/gemm_x86.h
+++ b/src/layer/x86/gemm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Gemm_x86 : virtual public Gemm
+class Gemm_x86 : public Gemm
{
public:
Gemm_x86();
diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h
index 826414eefc9..caf7c7c50c3 100644
--- a/src/layer/x86/gridsample_x86.h
+++ b/src/layer/x86/gridsample_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GridSample_x86 : virtual public GridSample
+class GridSample_x86 : public GridSample
{
public:
GridSample_x86();
diff --git a/src/layer/x86/groupnorm_x86.h b/src/layer/x86/groupnorm_x86.h
index c3085e3622e..151884e5455 100644
--- a/src/layer/x86/groupnorm_x86.h
+++ b/src/layer/x86/groupnorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class GroupNorm_x86 : virtual public GroupNorm
+class GroupNorm_x86 : public GroupNorm
{
public:
GroupNorm_x86();
diff --git a/src/layer/x86/hardsigmoid_x86.h b/src/layer/x86/hardsigmoid_x86.h
index b111608bb87..418a8dc941f 100644
--- a/src/layer/x86/hardsigmoid_x86.h
+++ b/src/layer/x86/hardsigmoid_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSigmoid_x86 : virtual public HardSigmoid
+class HardSigmoid_x86 : public HardSigmoid
{
public:
HardSigmoid_x86();
diff --git a/src/layer/x86/hardswish_x86.h b/src/layer/x86/hardswish_x86.h
index 37fd42a513c..4fe521ea47d 100644
--- a/src/layer/x86/hardswish_x86.h
+++ b/src/layer/x86/hardswish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class HardSwish_x86 : virtual public HardSwish
+class HardSwish_x86 : public HardSwish
{
public:
HardSwish_x86();
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
index 67bf0cca548..dee07d1de64 100644
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -53,7 +53,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt)
{
// if (opt.use_packing_layout)
{
- flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+ flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
ncnn::ParamDict pd;
@@ -80,10 +80,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt)
innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -178,10 +175,7 @@ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt)
innerproduct_transform_kernel_fp16s_sse(weight_data, weight_data_tm, num_input, num_output, opt);
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
@@ -287,10 +281,7 @@ int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt)
scale_in_data[p] = scale_in;
}
- if (opt.lightmode)
- {
- weight_data.release();
- }
+ weight_data.release();
return 0;
}
diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h
index 211131e6132..19da245f32f 100644
--- a/src/layer/x86/innerproduct_x86.h
+++ b/src/layer/x86/innerproduct_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class InnerProduct_x86 : virtual public InnerProduct
+class InnerProduct_x86 : public InnerProduct
{
public:
InnerProduct_x86();
diff --git a/src/layer/x86/interp_x86.h b/src/layer/x86/interp_x86.h
index 6f91b950ef5..46fcde6f221 100644
--- a/src/layer/x86/interp_x86.h
+++ b/src/layer/x86/interp_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Interp_x86 : virtual public Interp
+class Interp_x86 : public Interp
{
public:
Interp_x86();
diff --git a/src/layer/x86/layernorm_x86.h b/src/layer/x86/layernorm_x86.h
index 42eb551ed95..7e8ec05894c 100644
--- a/src/layer/x86/layernorm_x86.h
+++ b/src/layer/x86/layernorm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LayerNorm_x86 : virtual public LayerNorm
+class LayerNorm_x86 : public LayerNorm
{
public:
LayerNorm_x86();
diff --git a/src/layer/x86/lrn_x86.h b/src/layer/x86/lrn_x86.h
index 3fe791872c6..9aa85367cda 100644
--- a/src/layer/x86/lrn_x86.h
+++ b/src/layer/x86/lrn_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LRN_x86 : virtual public LRN
+class LRN_x86 : public LRN
{
public:
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp
index 6ba218e53d3..5d693648f44 100644
--- a/src/layer/x86/lstm_x86.cpp
+++ b/src/layer/x86/lstm_x86.cpp
@@ -182,12 +182,9 @@ int LSTM_x86::create_pipeline(const Option& opt)
}
}
- if (opt.lightmode)
- {
- weight_xc_data.release();
- bias_c_data.release();
- weight_hc_data.release();
- }
+ weight_xc_data.release();
+ bias_c_data.release();
+ weight_hc_data.release();
return 0;
}
diff --git a/src/layer/x86/lstm_x86.h b/src/layer/x86/lstm_x86.h
index cab7d7e32fa..1dc56d45e03 100644
--- a/src/layer/x86/lstm_x86.h
+++ b/src/layer/x86/lstm_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class LSTM_x86 : virtual public LSTM
+class LSTM_x86 : public LSTM
{
public:
LSTM_x86();
diff --git a/src/layer/x86/matmul_x86.cpp b/src/layer/x86/matmul_x86.cpp
index 2c829ea1848..d0afe81f76b 100644
--- a/src/layer/x86/matmul_x86.cpp
+++ b/src/layer/x86/matmul_x86.cpp
@@ -25,7 +25,7 @@ MatMul_x86::MatMul_x86()
int MatMul_x86::create_pipeline(const Option& opt)
{
- gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
diff --git a/src/layer/x86/matmul_x86.h b/src/layer/x86/matmul_x86.h
index 12311e7a94d..afbb85a7883 100644
--- a/src/layer/x86/matmul_x86.h
+++ b/src/layer/x86/matmul_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MatMul_x86 : virtual public MatMul
+class MatMul_x86 : public MatMul
{
public:
MatMul_x86();
diff --git a/src/layer/x86/mish_x86.cpp b/src/layer/x86/mish_x86.cpp
index e55a5e1f808..90ce135c19a 100644
--- a/src/layer/x86/mish_x86.cpp
+++ b/src/layer/x86/mish_x86.cpp
@@ -31,64 +31,8 @@ int Mish_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
- int size = w * h * d;
-#if __SSE2__
int elempack = bottom_top_blob.elempack;
-
-#if __AVX__
-#if __AVX512F__
- if (elempack == 16)
- {
- Mat tmp;
- convert_packing(bottom_top_blob, tmp, 8, opt);
-
- forward_inplace(tmp, opt);
-
- convert_packing(tmp, bottom_top_blob, 16, opt);
-
- return 0;
- }
-#endif // __AVX512F__
-
- if (elempack == 8)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q = 0; q < channels; q++)
- {
- float* ptr = bottom_top_blob.channel(q);
-
- for (int i = 0; i < size; i++)
- {
- __m256 _p = _mm256_loadu_ps(ptr);
- _p = mish_avx(_p);
- _mm256_storeu_ps(ptr, _p);
- ptr += 8;
- }
- }
-
- return 0;
- }
-#endif // __AVX__
-
- if (elempack == 4)
- {
- #pragma omp parallel for num_threads(opt.num_threads)
- for (int q = 0; q < channels; q++)
- {
- float* ptr = bottom_top_blob.channel(q);
-
- for (int i = 0; i < size; i++)
- {
- __m128 _p = _mm_loadu_ps(ptr);
- _p = mish_sse(_p);
- _mm_storeu_ps(ptr, _p);
- ptr += 4;
- }
- }
-
- return 0;
- }
-#endif // __SSE2__
+ int size = w * h * d * elempack;
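+ // packed layouts keep each channel's data contiguous, so one flat loop over
+ // w*h*d*elempack covers elempack 1/4/8/16 without per-packing dispatch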
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
@@ -98,6 +42,15 @@ int Mish_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
int i = 0;
#if __SSE2__
#if __AVX__
+#if __AVX512F__
+ for (; i + 15 < size; i += 16)
+ {
+ __m512 _p = _mm512_loadu_ps(ptr);
+ _p = mish_avx512(_p);
+ _mm512_storeu_ps(ptr, _p);
+ ptr += 16;
+ }
+#endif
for (; i + 7 < size; i += 8)
{
__m256 _p = _mm256_loadu_ps(ptr);
diff --git a/src/layer/x86/mish_x86.h b/src/layer/x86/mish_x86.h
index fe625e2ca37..dce8823c6f5 100644
--- a/src/layer/x86/mish_x86.h
+++ b/src/layer/x86/mish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Mish_x86 : virtual public Mish
+class Mish_x86 : public Mish
{
public:
Mish_x86();
diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp
index 98397437c9d..2bddad5582d 100644
--- a/src/layer/x86/multiheadattention_x86.cpp
+++ b/src/layer/x86/multiheadattention_x86.cpp
@@ -42,7 +42,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
const int embed_dim_per_head = embed_dim / num_heads;
const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head);
- q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(0, inv_sqrt_embed_dim_per_head);
pd.set(1, 1.f);
@@ -65,15 +65,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
q_gemm->load_model(ModelBinFromMatArray(weights));
q_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- q_weight_data.release();
- q_bias_data.release();
- }
+ q_weight_data.release();
+ q_bias_data.release();
}
{
- k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -94,15 +91,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
k_gemm->load_model(ModelBinFromMatArray(weights));
k_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- k_weight_data.release();
- k_bias_data.release();
- }
+ k_weight_data.release();
+ k_bias_data.release();
}
{
- v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -123,15 +117,12 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
v_gemm->load_model(ModelBinFromMatArray(weights));
v_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- v_weight_data.release();
- v_bias_data.release();
- }
+ v_weight_data.release();
+ v_bias_data.release();
}
{
- qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 0); // transB
@@ -151,7 +142,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
qk_gemm->create_pipeline(opt1);
}
{
- qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 0); // transA
pd.set(3, 1); // transB
@@ -173,7 +164,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
}
{
- qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, -1);
pd.set(1, 1);
@@ -183,7 +174,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
}
{
- o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm);
+ o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
ncnn::ParamDict pd;
pd.set(2, 1); // transA
pd.set(3, 1); // transB
@@ -202,11 +193,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt)
o_gemm->load_model(ModelBinFromMatArray(weights));
o_gemm->create_pipeline(opt);
- if (opt.lightmode)
- {
- out_weight_data.release();
- out_bias_data.release();
- }
+ out_weight_data.release();
+ out_bias_data.release();
}
return 0;
diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h
index a19a18001f5..55ea41780dd 100644
--- a/src/layer/x86/multiheadattention_x86.h
+++ b/src/layer/x86/multiheadattention_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class MultiHeadAttention_x86 : virtual public MultiHeadAttention
+class MultiHeadAttention_x86 : public MultiHeadAttention
{
public:
MultiHeadAttention_x86();
diff --git a/src/layer/x86/packing_x86.h b/src/layer/x86/packing_x86.h
index a00e74a4411..9f8f368039d 100644
--- a/src/layer/x86/packing_x86.h
+++ b/src/layer/x86/packing_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Packing_x86 : virtual public Packing
+class Packing_x86 : public Packing
{
public:
Packing_x86();
diff --git a/src/layer/x86/padding_x86.h b/src/layer/x86/padding_x86.h
index f01a4a19757..8772fe30eed 100644
--- a/src/layer/x86/padding_x86.h
+++ b/src/layer/x86/padding_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Padding_x86 : virtual public Padding
+class Padding_x86 : public Padding
{
public:
Padding_x86();
diff --git a/src/layer/x86/pooling_x86.h b/src/layer/x86/pooling_x86.h
index b79685c1840..030964fcb4d 100644
--- a/src/layer/x86/pooling_x86.h
+++ b/src/layer/x86/pooling_x86.h
@@ -22,7 +22,7 @@
namespace ncnn {
-class Pooling_x86 : virtual public Pooling
+class Pooling_x86 : public Pooling
{
public:
Pooling_x86();
diff --git a/src/layer/x86/prelu_x86.h b/src/layer/x86/prelu_x86.h
index 6bbfeae0f0d..17d60d4b297 100644
--- a/src/layer/x86/prelu_x86.h
+++ b/src/layer/x86/prelu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class PReLU_x86 : virtual public PReLU
+class PReLU_x86 : public PReLU
{
public:
PReLU_x86();
diff --git a/src/layer/x86/quantize_x86.h b/src/layer/x86/quantize_x86.h
index 6fb2d41d662..5c743fe4cff 100644
--- a/src/layer/x86/quantize_x86.h
+++ b/src/layer/x86/quantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Quantize_x86 : virtual public Quantize
+class Quantize_x86 : public Quantize
{
public:
Quantize_x86();
diff --git a/src/layer/x86/relu_x86.h b/src/layer/x86/relu_x86.h
index 6d3cce1c5d8..9d0b5966f53 100644
--- a/src/layer/x86/relu_x86.h
+++ b/src/layer/x86/relu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ReLU_x86 : virtual public ReLU
+class ReLU_x86 : public ReLU
{
public:
ReLU_x86();
diff --git a/src/layer/x86/requantize_x86.h b/src/layer/x86/requantize_x86.h
index 02b6880f0e9..febc418654f 100644
--- a/src/layer/x86/requantize_x86.h
+++ b/src/layer/x86/requantize_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Requantize_x86 : virtual public Requantize
+class Requantize_x86 : public Requantize
{
public:
Requantize_x86();
diff --git a/src/layer/x86/reshape_x86.h b/src/layer/x86/reshape_x86.h
index a29b91c1b50..56c8ddfb357 100644
--- a/src/layer/x86/reshape_x86.h
+++ b/src/layer/x86/reshape_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Reshape_x86 : virtual public Reshape
+class Reshape_x86 : public Reshape
{
public:
Reshape_x86();
diff --git a/src/layer/x86/roialign_x86.h b/src/layer/x86/roialign_x86.h
index f1c4ff912b3..1b91c1a8cbe 100644
--- a/src/layer/x86/roialign_x86.h
+++ b/src/layer/x86/roialign_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ROIAlign_x86 : virtual public ROIAlign
+class ROIAlign_x86 : public ROIAlign
{
public:
ROIAlign_x86();
diff --git a/src/layer/x86/scale_x86.h b/src/layer/x86/scale_x86.h
index 840e6903c33..f06cf414688 100644
--- a/src/layer/x86/scale_x86.h
+++ b/src/layer/x86/scale_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Scale_x86 : virtual public Scale
+class Scale_x86 : public Scale
{
public:
Scale_x86();
diff --git a/src/layer/x86/selu_x86.h b/src/layer/x86/selu_x86.h
index d7b5bf8a87e..7f4a78f80ed 100644
--- a/src/layer/x86/selu_x86.h
+++ b/src/layer/x86/selu_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class SELU_x86 : virtual public SELU
+class SELU_x86 : public SELU
{
public:
SELU_x86();
diff --git a/src/layer/x86/shufflechannel_x86.h b/src/layer/x86/shufflechannel_x86.h
index 6adca483c17..1e4328a2560 100644
--- a/src/layer/x86/shufflechannel_x86.h
+++ b/src/layer/x86/shufflechannel_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class ShuffleChannel_x86 : virtual public ShuffleChannel
+class ShuffleChannel_x86 : public ShuffleChannel
{
public:
ShuffleChannel_x86();
diff --git a/src/layer/x86/sigmoid_x86.h b/src/layer/x86/sigmoid_x86.h
index 05ea2c40f11..52bf85d9eaf 100644
--- a/src/layer/x86/sigmoid_x86.h
+++ b/src/layer/x86/sigmoid_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Sigmoid_x86 : virtual public Sigmoid
+class Sigmoid_x86 : public Sigmoid
{
public:
Sigmoid_x86();
diff --git a/src/layer/x86/slice_x86.h b/src/layer/x86/slice_x86.h
index fd6fbf9a1b7..0c9b266f84d 100644
--- a/src/layer/x86/slice_x86.h
+++ b/src/layer/x86/slice_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Slice_x86 : virtual public Slice
+class Slice_x86 : public Slice
{
public:
Slice_x86();
diff --git a/src/layer/x86/softmax_x86.h b/src/layer/x86/softmax_x86.h
index c899dcd1cc8..3d1b733a9ec 100644
--- a/src/layer/x86/softmax_x86.h
+++ b/src/layer/x86/softmax_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Softmax_x86 : virtual public Softmax
+class Softmax_x86 : public Softmax
{
public:
Softmax_x86();
diff --git a/src/layer/x86/swish_x86.h b/src/layer/x86/swish_x86.h
index 03c6d5e4b30..76b7c3d83f6 100644
--- a/src/layer/x86/swish_x86.h
+++ b/src/layer/x86/swish_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Swish_x86 : virtual public Swish
+class Swish_x86 : public Swish
{
public:
Swish_x86();
diff --git a/src/layer/x86/tanh_x86.h b/src/layer/x86/tanh_x86.h
index 60913d49c7b..e4c4477bc56 100644
--- a/src/layer/x86/tanh_x86.h
+++ b/src/layer/x86/tanh_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class TanH_x86 : virtual public TanH
+class TanH_x86 : public TanH
{
public:
TanH_x86();
diff --git a/src/layer/x86/unaryop_x86.h b/src/layer/x86/unaryop_x86.h
index 8e8f6c4d2de..0e4a7ff59e1 100644
--- a/src/layer/x86/unaryop_x86.h
+++ b/src/layer/x86/unaryop_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class UnaryOp_x86 : virtual public UnaryOp
+class UnaryOp_x86 : public UnaryOp
{
public:
UnaryOp_x86();
diff --git a/src/layer/x86/yolov3detectionoutput_x86.h b/src/layer/x86/yolov3detectionoutput_x86.h
index ef93d4647f8..c378b5827b7 100644
--- a/src/layer/x86/yolov3detectionoutput_x86.h
+++ b/src/layer/x86/yolov3detectionoutput_x86.h
@@ -19,7 +19,7 @@
namespace ncnn {
-class Yolov3DetectionOutput_x86 : virtual public Yolov3DetectionOutput
+class Yolov3DetectionOutput_x86 : public Yolov3DetectionOutput
{
public:
Yolov3DetectionOutput_x86();
diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp
index 9b9ba7dc289..1e0d86d73a4 100644
--- a/src/layer/yolodetectionoutput.cpp
+++ b/src/layer/yolodetectionoutput.cpp
@@ -38,7 +38,7 @@ int YoloDetectionOutput::load_param(const ParamDict& pd)
int YoloDetectionOutput::create_pipeline(const Option& opt)
{
{
- softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
ncnn::ParamDict pd;
pd.set(0, 0); // axis
diff --git a/src/layer/yolov3detectionoutput.cpp b/src/layer/yolov3detectionoutput.cpp
index 494fb6d186a..7528f5033cd 100644
--- a/src/layer/yolov3detectionoutput.cpp
+++ b/src/layer/yolov3detectionoutput.cpp
@@ -25,7 +25,7 @@ Yolov3DetectionOutput::Yolov3DetectionOutput()
one_blob_only = false;
support_inplace = false;
- //softmax = ncnn::create_layer(ncnn::LayerType::Softmax);
+ //softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
// set param
ncnn::ParamDict pd;
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in
index 52393b498e6..dfe8e73ce79 100644
--- a/src/layer_registry.h.in
+++ b/src/layer_registry.h.in
@@ -6,16 +6,22 @@ static const layer_registry_entry layer_registry[] = {
@layer_registry@
};
+static const layer_registry_entry layer_registry_arch[] = {
+@layer_registry_arch@
+};
+
#if NCNN_RUNTIME_CPU && NCNN_AVX512
static const layer_registry_entry layer_registry_avx512[] = {
@layer_registry_avx512@
};
#endif // NCNN_RUNTIME_CPU && NCNN_AVX512
+
#if NCNN_RUNTIME_CPU && NCNN_FMA
static const layer_registry_entry layer_registry_fma[] = {
@layer_registry_fma@
};
#endif // NCNN_RUNTIME_CPU && NCNN_FMA
+
#if NCNN_RUNTIME_CPU && NCNN_AVX
static const layer_registry_entry layer_registry_avx[] = {
@layer_registry_avx@
@@ -45,3 +51,9 @@ static const layer_registry_entry layer_registry_rvv[] = {
@layer_registry_rvv@
};
#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+
+#if NCNN_VULKAN
+static const layer_registry_entry layer_registry_vulkan[] = {
+@layer_registry_vulkan@
+};
+#endif // NCNN_VULKAN
diff --git a/src/mat_pixel_resize.cpp b/src/mat_pixel_resize.cpp
index a559a7dac04..f28ce061bca 100644
--- a/src/mat_pixel_resize.cpp
+++ b/src/mat_pixel_resize.cpp
@@ -38,12 +38,12 @@ static void vresize_two(const short* rows0p, const short* rows1p, int wsize, uns
int16x8_t _r01 = vld1q_s16(rows0p + 8);
int16x8_t _r10 = vld1q_s16(rows1p);
int16x8_t _r11 = vld1q_s16(rows1p + 8);
- int16x8_t _acc00 = vaddq_s16(vqdmulhq_s16(_r00, _b0), vqdmulhq_s16(_r10, _b1));
- int16x8_t _acc01 = vaddq_s16(vqdmulhq_s16(_r01, _b0), vqdmulhq_s16(_r11, _b1));
- int16x8_t _acc10 = vaddq_s16(vqdmulhq_s16(_r00, _b2), vqdmulhq_s16(_r10, _b3));
- int16x8_t _acc11 = vaddq_s16(vqdmulhq_s16(_r01, _b2), vqdmulhq_s16(_r11, _b3));
- uint8x16_t _Dp0 = vcombine_u8(vqrshrun_n_s16(_acc00, 3), vqrshrun_n_s16(_acc01, 3));
- uint8x16_t _Dp1 = vcombine_u8(vqrshrun_n_s16(_acc10, 3), vqrshrun_n_s16(_acc11, 3));
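+ // vqdmulhq_s16 computes (a*b*2)>>16; adding two such terms directly can
+ // overflow int16, so each term is pre-halved (vshrq/vsraq by 1) and the
+ // final narrowing shift drops from 3 to 2 to keep the same overall scale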
+ int16x8_t _acc00 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
+ int16x8_t _acc01 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
+ int16x8_t _acc10 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b2), 1), vqdmulhq_s16(_r10, _b3), 1);
+ int16x8_t _acc11 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b2), 1), vqdmulhq_s16(_r11, _b3), 1);
+ uint8x16_t _Dp0 = vcombine_u8(vqrshrun_n_s16(_acc00, 2), vqrshrun_n_s16(_acc01, 2));
+ uint8x16_t _Dp1 = vcombine_u8(vqrshrun_n_s16(_acc10, 2), vqrshrun_n_s16(_acc11, 2));
vst1q_u8(Dp0, _Dp0);
vst1q_u8(Dp1, _Dp1);
Dp0 += 16;
@@ -55,10 +55,10 @@ static void vresize_two(const short* rows0p, const short* rows1p, int wsize, uns
{
int16x8_t _r0 = vld1q_s16(rows0p);
int16x8_t _r1 = vld1q_s16(rows1p);
- int16x8_t _acc0 = vaddq_s16(vqdmulhq_s16(_r0, _b0), vqdmulhq_s16(_r1, _b1));
- int16x8_t _acc1 = vaddq_s16(vqdmulhq_s16(_r0, _b2), vqdmulhq_s16(_r1, _b3));
- uint8x8_t _Dp0 = vqrshrun_n_s16(_acc0, 3);
- uint8x8_t _Dp1 = vqrshrun_n_s16(_acc1, 3);
+ int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
+ int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b2), 1), vqdmulhq_s16(_r1, _b3), 1);
+ uint8x8_t _Dp0 = vqrshrun_n_s16(_acc0, 2);
+ uint8x8_t _Dp1 = vqrshrun_n_s16(_acc1, 2);
vst1_u8(Dp0, _Dp0);
vst1_u8(Dp1, _Dp1);
Dp0 += 8;
@@ -136,9 +136,9 @@ static void vresize_one(const short* rows0p, const short* rows1p, int wsize, uns
int16x8_t _r01 = vld1q_s16(rows0p + 8);
int16x8_t _r10 = vld1q_s16(rows1p);
int16x8_t _r11 = vld1q_s16(rows1p + 8);
- int16x8_t _acc0 = vaddq_s16(vqdmulhq_s16(_r00, _b0), vqdmulhq_s16(_r10, _b1));
- int16x8_t _acc1 = vaddq_s16(vqdmulhq_s16(_r01, _b0), vqdmulhq_s16(_r11, _b1));
- uint8x16_t _Dp = vcombine_u8(vqrshrun_n_s16(_acc0, 3), vqrshrun_n_s16(_acc1, 3));
+ int16x8_t _acc0 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r00, _b0), 1), vqdmulhq_s16(_r10, _b1), 1);
+ int16x8_t _acc1 = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r01, _b0), 1), vqdmulhq_s16(_r11, _b1), 1);
+ uint8x16_t _Dp = vcombine_u8(vqrshrun_n_s16(_acc0, 2), vqrshrun_n_s16(_acc1, 2));
vst1q_u8(Dp, _Dp);
Dp += 16;
rows0p += 16;
@@ -148,8 +148,8 @@ static void vresize_one(const short* rows0p, const short* rows1p, int wsize, uns
{
int16x8_t _r0 = vld1q_s16(rows0p);
int16x8_t _r1 = vld1q_s16(rows1p);
- int16x8_t _acc = vaddq_s16(vqdmulhq_s16(_r0, _b0), vqdmulhq_s16(_r1, _b1));
- uint8x8_t _Dp = vqrshrun_n_s16(_acc, 3);
+ int16x8_t _acc = vsraq_n_s16(vshrq_n_s16(vqdmulhq_s16(_r0, _b0), 1), vqdmulhq_s16(_r1, _b1), 1);
+ uint8x8_t _Dp = vqrshrun_n_s16(_acc, 2);
vst1_u8(Dp, _Dp);
Dp += 8;
rows0p += 8;
diff --git a/src/net.cpp b/src/net.cpp
index a7198d0a16e..ff2ab609137 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -126,6 +126,9 @@ static Option get_masked_option(const Option& opt, int featmask)
opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5));
opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6));
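+ // featmask bit 7 pins this layer to single-threaded execution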
+ if (featmask & (1 << 7))
+ opt1.num_threads = 1;
+
return opt1;
}
@@ -145,6 +148,8 @@ int NetPrivate::upload_model()
}
Option opt_upload = opt;
+ opt_upload.blob_allocator = 0;
+ opt_upload.workspace_allocator = 0;
opt_upload.blob_vkallocator = weight_vkallocator;
opt_upload.workspace_vkallocator = weight_vkallocator;
opt_upload.staging_vkallocator = weight_staging_vkallocator;
@@ -616,15 +621,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && layer->support_fp16_storage)
+#if NCNN_VFPV4
+ if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && layer->support_fp16_storage)
{
@@ -726,15 +731,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && !layer->support_fp16_storage)
+#if NCNN_VFPV4
+ if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_RVV
if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && !layer->support_fp16_storage)
{
@@ -1342,8 +1347,11 @@ int Net::load_param(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+ if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+ if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+ if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
@@ -1354,6 +1362,9 @@ int Net::load_param(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
+
+ // fp16 uniform makes no sense when fp16 arithmetic disabled
+ if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
@@ -1377,9 +1388,15 @@ int Net::load_param(const DataReader& dr)
SCAN_VALUE("%d", top_count)
Layer* layer = create_overwrite_builtin_layer(layer_type);
+#if NCNN_VULKAN
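+ // prefer the vulkan implementation when gpu compute is enabled; cpu fallback below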
+ if (!layer && opt.use_vulkan_compute && d->vkdev)
+ {
+ layer = create_layer_vulkan(layer_type);
+ }
+#endif // NCNN_VULKAN
if (!layer)
{
- layer = create_layer(layer_type);
+ layer = create_layer_cpu(layer_type);
}
if (!layer)
{
@@ -1402,7 +1419,6 @@ int Net::load_param(const DataReader& dr)
// NCNN_LOGE("new layer %d %s", i, layer_name);
layer->bottoms.resize(bottom_count);
-
for (int j = 0; j < bottom_count; j++)
{
char bottom_name[256];
@@ -1446,20 +1462,16 @@ int Net::load_param(const DataReader& dr)
blob_index++;
}
+ int layer_support_vulkan = layer->support_vulkan;
+
// layer specific params
int pdlr = pd.load_param(dr);
if (pdlr != 0)
{
- NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
+ NCNN_LOGE("ParamDict load_param %d %s failed", i, layer_name);
continue;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
// pull out top shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
@@ -1506,10 +1518,62 @@ int Net::load_param(const DataReader& dr)
int lr = layer->load_param(pd);
if (lr != 0)
{
- NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
+ NCNN_LOGE("layer load_param %d %s failed", i, layer_name);
continue;
}
+ if (layer->support_int8_storage)
+ {
+ // no int8 gpu support yet
+ opt.use_vulkan_compute = false;
+ }
+
+ Option opt1 = get_masked_option(opt, layer->featmask);
+#if NCNN_VULKAN
+ if (opt1.use_vulkan_compute)
+ {
+ if (!layer->support_image_storage) opt1.use_image_storage = false;
+ }
+#endif // NCNN_VULKAN
+
+ if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
+ {
+ // the vulkan layer cannot handle these params, recreate as a cpu layer
+ Layer* layer_cpu = create_overwrite_builtin_layer(layer_type);
+ if (!layer_cpu)
+ {
+ layer_cpu = create_layer_cpu(layer_type);
+ }
+ if (!layer_cpu)
+ {
+ layer_cpu = create_custom_layer(layer_type);
+ }
+ if (!layer_cpu)
+ {
+ NCNN_LOGE("layer %s not exists or registered", layer_type);
+ clear();
+ return -1;
+ }
+
+ layer_cpu->type = layer->type;
+ layer_cpu->name = layer->name;
+ layer_cpu->bottoms = layer->bottoms;
+ layer_cpu->tops = layer->tops;
+ layer_cpu->bottom_shapes = layer->bottom_shapes;
+ layer_cpu->top_shapes = layer->top_shapes;
+ layer_cpu->featmask = layer->featmask;
+
+ int lr = layer_cpu->load_param(pd);
+ if (lr != 0)
+ {
+ NCNN_LOGE("layer load_param %d %s failed", i, layer_name);
+ continue;
+ }
+
+ delete layer;
+ layer = layer_cpu;
+ }
+
d->layers[i] = layer;
}
@@ -1579,8 +1643,11 @@ int Net::load_param_bin(const DataReader& dr)
// sanitize use options
if (!d->vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false;
if (!d->vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false;
+ if (!d->vkdev->info.support_fp16_uniform()) opt.use_fp16_uniform = false;
if (!d->vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false;
+ if (!d->vkdev->info.support_int8_packed()) opt.use_int8_packed = false;
if (!d->vkdev->info.support_int8_storage()) opt.use_int8_storage = false;
+ if (!d->vkdev->info.support_int8_uniform()) opt.use_int8_uniform = false;
if (!d->vkdev->info.support_int8_arithmetic()) opt.use_int8_arithmetic = false;
if (!d->vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false;
@@ -1591,6 +1658,9 @@ int Net::load_param_bin(const DataReader& dr)
// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
+
+ // fp16 uniform makes no sense when fp16 arithmetic disabled
+ if (!opt.use_fp16_arithmetic) opt.use_fp16_uniform = false;
}
else
{
@@ -1611,9 +1681,15 @@ int Net::load_param_bin(const DataReader& dr)
READ_VALUE(top_count)
Layer* layer = create_overwrite_builtin_layer(typeindex);
+#if NCNN_VULKAN
+ if (!layer && opt.use_vulkan_compute && d->vkdev)
+ {
+ layer = create_layer_vulkan(typeindex);
+ }
+#endif // NCNN_VULKAN
if (!layer)
{
- layer = create_layer(typeindex);
+ layer = create_layer_cpu(typeindex);
}
if (!layer)
{
@@ -1665,24 +1741,16 @@ int Net::load_param_bin(const DataReader& dr)
layer->tops[j] = top_blob_index;
}
+ int layer_support_vulkan = layer->support_vulkan;
+
// layer specific params
int pdlr = pd.load_param_bin(dr);
if (pdlr != 0)
{
-#if NCNN_STRING
- NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str());
-#else
- NCNN_LOGE("ParamDict load_param %d failed", i);
-#endif
+ NCNN_LOGE("ParamDict load_param_bin %d failed", i);
continue;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
// pull out top blob shape hints
Mat shape_hints = pd.get(30, Mat());
if (!shape_hints.empty())
@@ -1729,14 +1797,61 @@ int Net::load_param_bin(const DataReader& dr)
int lr = layer->load_param(pd);
if (lr != 0)
{
-#if NCNN_STRING
- NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str());
-#else
NCNN_LOGE("layer load_param %d failed", i);
-#endif
continue;
}
+ if (layer->support_int8_storage)
+ {
+ // no int8 gpu support yet
+ opt.use_vulkan_compute = false;
+ }
+
+ Option opt1 = get_masked_option(opt, layer->featmask);
+#if NCNN_VULKAN
+ if (opt1.use_vulkan_compute)
+ {
+ if (!layer->support_image_storage) opt1.use_image_storage = false;
+ }
+#endif // NCNN_VULKAN
+
+ if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute))
+ {
+ // the vulkan layer cannot handle these params, recreate as a cpu layer
+ Layer* layer_cpu = create_overwrite_builtin_layer(typeindex);
+ if (!layer_cpu)
+ {
+ layer_cpu = create_layer_cpu(typeindex);
+ }
+ if (!layer_cpu)
+ {
+ int custom_index = typeindex & ~LayerType::CustomBit;
+ layer_cpu = create_custom_layer(custom_index);
+ }
+ if (!layer_cpu)
+ {
+ NCNN_LOGE("layer %d not exists or registered", typeindex);
+ clear();
+ return -1;
+ }
+
+ layer_cpu->bottoms = layer->bottoms;
+ layer_cpu->tops = layer->tops;
+ layer_cpu->bottom_shapes = layer->bottom_shapes;
+ layer_cpu->top_shapes = layer->top_shapes;
+ layer_cpu->featmask = layer->featmask;
+
+ int lr = layer_cpu->load_param(pd);
+ if (lr != 0)
+ {
+ NCNN_LOGE("layer load_param %d failed", i);
+ continue;
+ }
+
+ delete layer;
+ layer = layer_cpu;
+ }
+
d->layers[i] = layer;
}
@@ -1796,24 +1911,7 @@ int Net::load_model(const DataReader& dr)
break;
}
- if (layer->support_int8_storage)
- {
- // no int8 gpu support yet
- opt.use_vulkan_compute = false;
- }
-
Option opt1 = get_masked_option(opt, layer->featmask);
-#if NCNN_VULKAN
- if (opt1.use_vulkan_compute)
- {
- if (!layer->support_image_storage) opt1.use_image_storage = false;
- }
- else
- {
- layer->vkdev = 0;
- layer->support_vulkan = false;
- }
-#endif // NCNN_VULKAN
int cret = layer->create_pipeline(opt1);
if (cret != 0)
@@ -2378,7 +2476,8 @@ void Extractor::set_light_mode(bool enable)
void Extractor::set_num_threads(int num_threads)
{
- d->opt.num_threads = num_threads;
+ NCNN_LOGE("ex.set_num_threads() is no-op, please set net.opt.num_threads=N before net.load_param()");
+ NCNN_LOGE("If you want to use single thread for only some layer, see https://github.com/Tencent/ncnn/wiki/layer-feat-mask");
}
void Extractor::set_blob_allocator(Allocator* allocator)
@@ -2394,14 +2493,8 @@ void Extractor::set_workspace_allocator(Allocator* allocator)
#if NCNN_VULKAN
void Extractor::set_vulkan_compute(bool enable)
{
- if (d->net->d->opt.use_vulkan_compute)
- {
- d->opt.use_vulkan_compute = enable;
- }
- else
- {
- NCNN_LOGE("set_vulkan_compute failed, network use_vulkan_compute disabled");
- }
+ NCNN_LOGE("ex.set_vulkan_compute() is no-op, please set net.opt.use_vulkan_compute=true/false before net.load_param()");
+ NCNN_LOGE("If you want to disable vulkan for only some layer, see https://github.com/Tencent/ncnn/wiki/layer-feat-mask");
}
void Extractor::set_blob_vkallocator(VkAllocator* allocator)
@@ -2598,8 +2691,8 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
// clang-format off
// *INDENT-OFF*
-#if NCNN_ARM82
- if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
+#if NCNN_VFPV4
+ if (d->opt.use_fp16_storage && cpu_support_arm_vfpv4() && (type == 0))
{
if (feat.elembits() == 16)
{
@@ -2609,7 +2702,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type)
}
}
else
-#endif // NCNN_ARM82
+#endif // NCNN_VFPV4
#if NCNN_BF16
if (d->opt.use_bf16_storage && (type == 0))
{
diff --git a/src/net.h b/src/net.h
index 98e3ec335f1..d69443bbd3c 100644
--- a/src/net.h
+++ b/src/net.h
@@ -182,9 +182,8 @@ class NCNN_EXPORT Extractor
// enabled by default
void set_light_mode(bool enable);
- // set thread count for this extractor
- // this will overwrite the global setting
- // default count is system depended
+ // deprecated, no-op
+ // instead, set net.opt.num_threads before net.load_param()
void set_num_threads(int num_threads);
// set blob memory allocator
@@ -194,6 +193,8 @@ class NCNN_EXPORT Extractor
void set_workspace_allocator(Allocator* allocator);
#if NCNN_VULKAN
+ // deprecated, no-op
+ // instead, set net.opt.use_vulkan_compute before net.load_param()
void set_vulkan_compute(bool enable);
void set_blob_vkallocator(VkAllocator* allocator);
diff --git a/src/option.cpp b/src/option.cpp
index ea2dd6d25c8..a30dabe55f8 100644
--- a/src/option.cpp
+++ b/src/option.cpp
@@ -74,6 +74,9 @@ Option::Option()
use_winograd63_convolution = true;
use_a53_a55_optimized_kernel = is_current_thread_running_on_a53_a55();
+
+ use_fp16_uniform = true;
+ use_int8_uniform = true;
}
} // namespace ncnn
diff --git a/src/option.h b/src/option.h
index 7d0cc60ba7d..eb2a5a7d342 100644
--- a/src/option.h
+++ b/src/option.h
@@ -144,8 +144,10 @@ class NCNN_EXPORT Option
// but you can force this on/off if you wish
bool use_a53_a55_optimized_kernel;
- bool use_reserved_7;
- bool use_reserved_8;
+ // enable options for fp16/int8 uniform buffers in gpu shaders
+ bool use_fp16_uniform;
+ bool use_int8_uniform;
+
bool use_reserved_9;
bool use_reserved_10;
bool use_reserved_11;
diff --git a/src/ruapu.h b/src/ruapu.h
new file mode 100644
index 00000000000..ff3c19e2c46
--- /dev/null
+++ b/src/ruapu.h
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024 nihui (https://github.com/nihui)
+// Copyright (c) 2024 kernelbin (https://github.com/kernelbin)
+//
+// ruapu --- detect cpu isa features with a single file
+
+#ifndef RUAPU_H
+#define RUAPU_H
+
+void ruapu_init();
+
+int ruapu_supports(const char* isa);
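+
+// minimal usage sketch (illustrative only; the two functions above are the whole api):
+//
+//   #include <stdio.h>
+//   #define RUAPU_IMPLEMENTATION
+//   #include "ruapu.h"
+//
+//   int main()
+//   {
+//       ruapu_init();
+//       printf("avx2 = %d\n", ruapu_supports("avx2"));
+//       return 0;
+//   }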
+
+#ifdef RUAPU_IMPLEMENTATION
+
+#include <setjmp.h>
+#include <string.h>
+
+#if defined _WIN32
+
+#include <windows.h>
+
+#if WINAPI_FAMILY == WINAPI_FAMILY_APP
+static int ruapu_detect_isa(const void* some_inst)
+{
+ // uwp does not support seh :(
+ (void)some_inst;
+ return 0;
+}
+#else // WINAPI_FAMILY == WINAPI_FAMILY_APP
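+// probe strategy: call into a buffer holding one candidate instruction under a
+// vectored exception handler; if EXCEPTION_ILLEGAL_INSTRUCTION fires we longjmp
+// back and report the isa as unsupported, otherwise the instruction executed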
+static int g_ruapu_sigill_caught = 0;
+static jmp_buf g_ruapu_jmpbuf;
+
+typedef const void* ruapu_some_inst;
+
+static LONG CALLBACK ruapu_catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)
+{
+ if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION)
+ {
+ g_ruapu_sigill_caught = 1;
+ longjmp(g_ruapu_jmpbuf, -1);
+ }
+
+ return EXCEPTION_CONTINUE_SEARCH;
+}
+
+static int ruapu_detect_isa(const void* some_inst)
+{
+ g_ruapu_sigill_caught = 0;
+
+ PVOID eh = AddVectoredExceptionHandler(1, ruapu_catch_sigill);
+
+ if (setjmp(g_ruapu_jmpbuf) == 0)
+ {
+ ((void (*)())some_inst)();
+ }
+
+ RemoveVectoredExceptionHandler(eh);
+
+ return g_ruapu_sigill_caught ? 0 : 1;
+}
+#endif // WINAPI_FAMILY == WINAPI_FAMILY_APP
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned char ruapu_some_##isa[] = { __VA_ARGS__, 0xc3 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned char ruapu_some_##isa[] = { __VA_ARGS__, 0xc3 };
+#endif
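+
+// e.g. RUAPU_INSTCODE(avx, 0xc5, 0xfc, 0x54, 0xc0) places the raw machine-code
+// bytes for "vandps ymm0,ymm0,ymm0" in the executable .text section, terminated
+// with ret (0xc3), so the buffer can be invoked through a function pointer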
+
+#elif __aarch64__ || defined(_M_ARM64)
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xd65f03c0 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xd65f03c0 };
+#endif
+
+#elif __arm__ || defined(_M_ARM)
+#if __thumb__
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0x4770 };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0x4770 };
+#endif
+#else
+#ifdef _MSC_VER
+#define RUAPU_INSTCODE(isa, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xe12fff1e };
+#else
+#define RUAPU_INSTCODE(isa, ...) __attribute__((section(".text"))) static unsigned int ruapu_some_##isa[] = { __VA_ARGS__, 0xe12fff1e };
+#endif
+#endif
+
+#endif
+
+#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
+#include <signal.h>
+
+static int g_ruapu_sigill_caught = 0;
+static sigjmp_buf g_ruapu_jmpbuf;
+
+typedef void (*ruapu_some_inst)();
+
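+// same probe idea as the win32 path: execute the candidate instruction and
+// treat a caught SIGILL as "not supported", recovering via sigsetjmp/siglongjmp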
+static void ruapu_catch_sigill(int signo, siginfo_t* si, void* data)
+{
+ (void)signo;
+ (void)si;
+ (void)data;
+
+ g_ruapu_sigill_caught = 1;
+ siglongjmp(g_ruapu_jmpbuf, -1);
+}
+
+static int ruapu_detect_isa(ruapu_some_inst some_inst)
+{
+ g_ruapu_sigill_caught = 0;
+
+ struct sigaction sa = { 0 };
+ struct sigaction old_sa;
+ sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
+ sa.sa_sigaction = ruapu_catch_sigill;
+ sigaction(SIGILL, &sa, &old_sa);
+
+ if (sigsetjmp(g_ruapu_jmpbuf, 1) == 0)
+ {
+ some_inst();
+ }
+
+ sigaction(SIGILL, &old_sa, NULL);
+
+ return g_ruapu_sigill_caught ? 0 : 1;
+}
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".byte " #__VA_ARGS__ : : : ); }
+#elif __aarch64__
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".word " #__VA_ARGS__ : : : ); }
+#elif __arm__
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { asm volatile(".word " #__VA_ARGS__ : : : ); }
+#endif
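+
+// here each probe is a plain function whose body is the raw instruction
+// encoding emitted through an inline-asm .byte/.word directive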
+
+#else // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+typedef const void* ruapu_some_inst;
+static int ruapu_detect_isa(const void* some_inst)
+{
+ // unknown platform, bare metal os ?
+ (void)some_inst;
+ return 0;
+}
+
+#define RUAPU_INSTCODE(isa, ...) static void ruapu_some_##isa() { }
+#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
+
+struct ruapu_isa_entry
+{
+ const char* isa;
+ ruapu_some_inst inst;
+ int capable;
+};
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+RUAPU_INSTCODE(mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
+RUAPU_INSTCODE(sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
+RUAPU_INSTCODE(sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
+RUAPU_INSTCODE(sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
+RUAPU_INSTCODE(ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
+RUAPU_INSTCODE(sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
+RUAPU_INSTCODE(sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
+RUAPU_INSTCODE(sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
+RUAPU_INSTCODE(xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd xmm0,xmm0,xmm0,xmm0
+RUAPU_INSTCODE(avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
+RUAPU_INSTCODE(f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
+RUAPU_INSTCODE(fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
+RUAPU_INSTCODE(fma4, 0xc4, 0xe3, 0xfd, 0x68, 0xc0, 0x00) // vfmaddps ymm0,ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
+RUAPU_INSTCODE(avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
+RUAPU_INSTCODE(avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph zmm0,zmm0,zmm0
+RUAPU_INSTCODE(avxvnni, 0xc4, 0xe2, 0x7d, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
+RUAPU_INSTCODE(avxifma, 0xc4, 0xe2, 0xfd, 0xb4, 0xc0) // vpmadd52luq ymm0,ymm0,ymm0
+
+#elif __aarch64__ || defined(_M_ARM64)
+RUAPU_INSTCODE(neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
+RUAPU_INSTCODE(vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
+RUAPU_INSTCODE(cpuid, 0xd5380000) // mrs x0,midr_el1
+RUAPU_INSTCODE(asimdhp, 0x0e401400) // fadd v0.4h,v0.4h,v0.4h
+RUAPU_INSTCODE(asimddp, 0x4e809400) // sdot v0.4s,v0.16b,v0.16b
+RUAPU_INSTCODE(asimdfhm, 0x4e20ec00) // fmlal v0.4s,v0.4h,v0.4h
+RUAPU_INSTCODE(bf16, 0x6e40ec00) // bfmmla v0.4s,v0.8h,v0.8h
+RUAPU_INSTCODE(i8mm, 0x4e80a400) // smmla v0.4s,v0.16b,v0.16b
+RUAPU_INSTCODE(sve, 0x65608000) // fmad z0.h,p0/m,z0.h,z0.h
+RUAPU_INSTCODE(sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
+RUAPU_INSTCODE(svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
+RUAPU_INSTCODE(svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
+RUAPU_INSTCODE(svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
+
+#elif __arm__ || defined(_M_ARM)
+#if __thumb__
+RUAPU_INSTCODE(edsp, 0xfb20, 0x0000) // smlad r0,r0,r0,r0
+RUAPU_INSTCODE(neon, 0xef00, 0x0d40) // vadd.f32 q0,q0,q0
+RUAPU_INSTCODE(vfpv4, 0xffb6, 0x0600) // vcvt.f16.f32 d0,q0
+#else
+RUAPU_INSTCODE(edsp, 0xe7000010) // smlad r0,r0,r0,r0
+RUAPU_INSTCODE(neon, 0xf2000d40) // vadd.f32 q0,q0,q0
+RUAPU_INSTCODE(vfpv4, 0xf3b60600) // vcvt.f16.f32 d0,q0
+#endif
+
+#endif
+
+#undef RUAPU_INSTCODE
+
+#define RUAPU_ISAENTRY(isa) { #isa, (ruapu_some_inst)ruapu_some_##isa, 0 },
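+// e.g. RUAPU_ISAENTRY(avx) expands to: { "avx", (ruapu_some_inst)ruapu_some_avx, 0 },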
+
+struct ruapu_isa_entry g_ruapu_isa_map[] = {
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+RUAPU_ISAENTRY(mmx)
+RUAPU_ISAENTRY(sse)
+RUAPU_ISAENTRY(sse2)
+RUAPU_ISAENTRY(sse3)
+RUAPU_ISAENTRY(ssse3)
+RUAPU_ISAENTRY(sse41)
+RUAPU_ISAENTRY(sse42)
+RUAPU_ISAENTRY(sse4a)
+RUAPU_ISAENTRY(xop)
+RUAPU_ISAENTRY(avx)
+RUAPU_ISAENTRY(f16c)
+RUAPU_ISAENTRY(fma)
+RUAPU_ISAENTRY(fma4)
+RUAPU_ISAENTRY(avx2)
+RUAPU_ISAENTRY(avx512f)
+RUAPU_ISAENTRY(avx512bw)
+RUAPU_ISAENTRY(avx512cd)
+RUAPU_ISAENTRY(avx512dq)
+RUAPU_ISAENTRY(avx512vl)
+RUAPU_ISAENTRY(avx512vnni)
+RUAPU_ISAENTRY(avx512bf16)
+RUAPU_ISAENTRY(avx512ifma)
+RUAPU_ISAENTRY(avx512vbmi)
+RUAPU_ISAENTRY(avx512vbmi2)
+RUAPU_ISAENTRY(avx512fp16)
+RUAPU_ISAENTRY(avxvnni)
+RUAPU_ISAENTRY(avxvnniint8)
+RUAPU_ISAENTRY(avxifma)
+
+#elif __aarch64__ || defined(_M_ARM64)
+RUAPU_ISAENTRY(neon)
+RUAPU_ISAENTRY(vfpv4)
+RUAPU_ISAENTRY(cpuid)
+RUAPU_ISAENTRY(asimdhp)
+RUAPU_ISAENTRY(asimddp)
+RUAPU_ISAENTRY(asimdfhm)
+RUAPU_ISAENTRY(bf16)
+RUAPU_ISAENTRY(i8mm)
+RUAPU_ISAENTRY(sve)
+RUAPU_ISAENTRY(sve2)
+RUAPU_ISAENTRY(svebf16)
+RUAPU_ISAENTRY(svei8mm)
+RUAPU_ISAENTRY(svef32mm)
+
+#elif __arm__ || defined(_M_ARM)
+RUAPU_ISAENTRY(edsp)
+RUAPU_ISAENTRY(neon)
+RUAPU_ISAENTRY(vfpv4)
+
+#endif
+};
+
+#undef RUAPU_ISAENTRY
+
+void ruapu_init()
+{
+ for (size_t i = 0; i < sizeof(g_ruapu_isa_map) / sizeof(g_ruapu_isa_map[0]); i++)
+ {
+ g_ruapu_isa_map[i].capable = ruapu_detect_isa(g_ruapu_isa_map[i].inst);
+ }
+}
+
+int ruapu_supports(const char* isa)
+{
+ for (size_t i = 0; i < sizeof(g_ruapu_isa_map) / sizeof(g_ruapu_isa_map[0]); i++)
+ {
+ if (strcmp(g_ruapu_isa_map[i].isa, isa) == 0)
+ {
+ return g_ruapu_isa_map[i].capable;
+ }
+ }
+
+ return 0;
+}
+
+#endif // RUAPU_IMPLEMENTATION
+
+#endif // RUAPU_H
diff --git a/src/simplevk.cpp b/src/simplevk.cpp
index b4d1d778da8..4cd23b3c81b 100644
--- a/src/simplevk.cpp
+++ b/src/simplevk.cpp
@@ -316,6 +316,12 @@ static int load_vulkan_linux(const char* driver_path)
#endif
void* libvulkan = dlopen(libpath, RTLD_LOCAL | RTLD_NOW);
+#if !__APPLE__
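+ // fall back to the versioned runtime soname, which is present even when the
+ // unversioned libvulkan.so development symlink is not installed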
+ if (!libvulkan)
+ {
+ libvulkan = dlopen("libvulkan.so.1", RTLD_LOCAL | RTLD_NOW);
+ }
+#endif
if (!libvulkan)
{
NCNN_LOGE("dlopen failed %s", dlerror());
diff --git a/src/stb_image.h b/src/stb_image.h
index 1b4b337328e..1c2096a3a0d 100644
--- a/src/stb_image.h
+++ b/src/stb_image.h
@@ -786,6 +786,20 @@ static int stbi__sse2_available(void)
#endif
#endif
+// RISC-V VECTOR
+#if defined(STBI_NO_SIMD) && defined(STBI_RVV)
+#undef STBI_RVV
+#endif
+
+#ifdef STBI_RVV
+#include <riscv_vector.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
#ifndef STBI_SIMD_ALIGN
#define STBI_SIMD_ALIGN(type, name) type name
#endif
@@ -2910,6 +2924,180 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
#endif // STBI_NEON
+#ifdef STBI_RVV
+
+// risc-v vector integer IDCT. should produce bit-identical
+// results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+ const short rot0_0 = stbi__f2f(0.5411961f);
+ const short rot0_1 = stbi__f2f(-1.847759065f);
+ const short rot0_2 = stbi__f2f( 0.765366865f);
+ const short rot1_0 = stbi__f2f( 1.175875602f);
+ const short rot1_1 = stbi__f2f(-0.899976223f);
+ const short rot1_2 = stbi__f2f(-2.562915447f);
+ const short rot2_0 = stbi__f2f(-1.961570560f);
+ const short rot2_1 = stbi__f2f(-0.390180644f);
+ const short rot3_0 = stbi__f2f( 0.298631336f);
+ const short rot3_1 = stbi__f2f( 2.053119869f);
+ const short rot3_2 = stbi__f2f( 3.072711026f);
+ const short rot3_3 = stbi__f2f( 1.501321110f);
+
+ // scratch buffer for data transpose
+ short tmp[64];
+
+ const size_t vl = vsetvl_e16m1(8);
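+ // an 8x8 idct needs exactly 8 lanes per vector; with e16m1 this assumes VLEN >= 128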
+
+ // column pass
+ {
+ vint16m1_t row0 = vle16_v_i16m1(data + 0*8, vl);
+ vint16m1_t row1 = vle16_v_i16m1(data + 1*8, vl);
+ vint16m1_t row2 = vle16_v_i16m1(data + 2*8, vl);
+ vint16m1_t row3 = vle16_v_i16m1(data + 3*8, vl);
+ vint16m1_t row4 = vle16_v_i16m1(data + 4*8, vl);
+ vint16m1_t row5 = vle16_v_i16m1(data + 5*8, vl);
+ vint16m1_t row6 = vle16_v_i16m1(data + 6*8, vl);
+ vint16m1_t row7 = vle16_v_i16m1(data + 7*8, vl);
+
+ // even part
+ vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+ vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+ vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+ vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+ vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+ vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+ // odd part
+ vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+ vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+ vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+ vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+ vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+ vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+ vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+ vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+ vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+ vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+ vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+ vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+ vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+ vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+ vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+ vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+ vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+ // bfly32
+ x0 = vadd_vx_i32m2(x0, 512, vl);
+ x1 = vadd_vx_i32m2(x1, 512, vl);
+ x2 = vadd_vx_i32m2(x2, 512, vl);
+ x3 = vadd_vx_i32m2(x3, 512, vl);
+ vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 10, vl);
+ vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 10, vl);
+ vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 10, vl);
+ vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 10, vl);
+ vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 10, vl);
+ vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 10, vl);
+ vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 10, vl);
+ vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 10, vl);
+
+ // 8x8 transpose
+ // I would prefer to implement this transpose in register without save+load,
+ // but rvv does not have shuffle/zip instructions among multiple registers.
+ // what a pity :( --- nihui
+ vsse16_v_i16m1(tmp + 0, 8 * 2, out0, vl);
+ vsse16_v_i16m1(tmp + 1, 8 * 2, out1, vl);
+ vsse16_v_i16m1(tmp + 2, 8 * 2, out2, vl);
+ vsse16_v_i16m1(tmp + 3, 8 * 2, out3, vl);
+ vsse16_v_i16m1(tmp + 4, 8 * 2, out4, vl);
+ vsse16_v_i16m1(tmp + 5, 8 * 2, out5, vl);
+ vsse16_v_i16m1(tmp + 6, 8 * 2, out6, vl);
+ vsse16_v_i16m1(tmp + 7, 8 * 2, out7, vl);
+ }
+
+ // row pass
+ {
+ vint16m1_t row0 = vle16_v_i16m1(tmp + 0*8, vl);
+ vint16m1_t row1 = vle16_v_i16m1(tmp + 1*8, vl);
+ vint16m1_t row2 = vle16_v_i16m1(tmp + 2*8, vl);
+ vint16m1_t row3 = vle16_v_i16m1(tmp + 3*8, vl);
+ vint16m1_t row4 = vle16_v_i16m1(tmp + 4*8, vl);
+ vint16m1_t row5 = vle16_v_i16m1(tmp + 5*8, vl);
+ vint16m1_t row6 = vle16_v_i16m1(tmp + 6*8, vl);
+ vint16m1_t row7 = vle16_v_i16m1(tmp + 7*8, vl);
+
+ // even part
+ vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+ vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+ vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+ vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+ vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+ vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+ vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+ vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+ // odd part
+ vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+ vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+ vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+ vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+ vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+ vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+ vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+ vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+ vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+ vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+ vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+ vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+ vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+ vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+ vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+ vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+ vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+ // bfly32
+ x0 = vadd_vx_i32m2(x0, (int)(65536 + (128<<17)), vl);
+ x1 = vadd_vx_i32m2(x1, (int)(65536 + (128<<17)), vl);
+ x2 = vadd_vx_i32m2(x2, (int)(65536 + (128<<17)), vl);
+ x3 = vadd_vx_i32m2(x3, (int)(65536 + (128<<17)), vl);
+ vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 17, vl);
+ vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 17, vl);
+ vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 17, vl);
+ vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 17, vl);
+ vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 17, vl);
+ vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 17, vl);
+ vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 17, vl);
+ vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 17, vl);
+
+ // clamp 0~255
+ vuint8m1_t out0u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out0, 0, vl)), 0, vl);
+ vuint8m1_t out7u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out7, 0, vl)), 0, vl);
+ vuint8m1_t out1u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out1, 0, vl)), 0, vl);
+ vuint8m1_t out6u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out6, 0, vl)), 0, vl);
+ vuint8m1_t out2u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out2, 0, vl)), 0, vl);
+ vuint8m1_t out5u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out5, 0, vl)), 0, vl);
+ vuint8m1_t out3u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out3, 0, vl)), 0, vl);
+ vuint8m1_t out4u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out4, 0, vl)), 0, vl);
+
+ // 8x8 transpose
+ vsse8_v_u8m1(out + 0, out_stride, out0u8, vl);
+ vsse8_v_u8m1(out + 1, out_stride, out1u8, vl);
+ vsse8_v_u8m1(out + 2, out_stride, out2u8, vl);
+ vsse8_v_u8m1(out + 3, out_stride, out3u8, vl);
+ vsse8_v_u8m1(out + 4, out_stride, out4u8, vl);
+ vsse8_v_u8m1(out + 5, out_stride, out5u8, vl);
+ vsse8_v_u8m1(out + 6, out_stride, out6u8, vl);
+ vsse8_v_u8m1(out + 7, out_stride, out7u8, vl);
+ }
+}
+
+#endif // STBI_RVV
+
#define STBI__MARKER_none 0xff
// if there's a pending marker from the entropy stream, return that
// otherwise, fetch from the stream and get a marker. if there's no
@@ -3524,7 +3712,7 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
return out;
}
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
{
// need to generate 2x2 samples for every one in input
@@ -3536,6 +3724,48 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
}
t1 = 3*in_near[0] + in_far[0];
+#if defined(STBI_RVV)
+ // process groups of vl pixels for as long as we can.
+ // note we can't handle the last pixel in a row in this loop
+ // because we need to handle the filter boundary conditions.
+ int n = w-1;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m4(n);
+
+ // load and perform the vertical filtering pass
+ vuint8m4_t farb = vle8_v_u8m4(in_far + i, vl);
+ vuint8m4_t nearb = vle8_v_u8m4(in_near + i, vl);
+ vuint16m8_t curr = vadd_vv_u16m8(vwmulu_vx_u16m8(nearb, 3, vl), vwcvtu_x_x_v_u16m8(farb, vl), vl); // current row
+
+ // horizontal filter works the same based on shifted versions of the
+ // current row. "prev" is current row shifted right by 1 pixel; we need
+ // to insert the previous pixel value (from t1).
+ // "next" is current row shifted left by 1 pixel, with the first pixel
+ // of the next block of vl pixels added in.
+ vuint16m8_t prev = vslide1up_vx_u16m8(curr, t1, vl);
+ vuint16m8_t next = vslide1down_vx_u16m8(curr, 3*in_near[i+vl] + in_far[i+vl], vl);
+
+ // horizontal filter, polyphase implementation since it's convenient:
+ // even pixels = 3*cur + prev + 8
+ // odd pixels = 3*cur + next + 8
+ // note the shared term.
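+ // the vertical pass scaled by 4 (weights 3+1) and this pass scales by 4
+ // again, so results are 16x the filtered value; the +8 is the rounding
+ // term for the >>4 that undoes the scaling below.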
+ vuint16m8_t curs = vmacc_vx_u16m8(vmv_v_x_u16m8(8, vl), 3, curr, vl);
+ vuint16m8_t even = vadd_vv_u16m8(curs, prev, vl);
+ vuint16m8_t odd = vadd_vv_u16m8(curs, next, vl);
+
+ // undo scaling and round, then store with even/odd phases interleaved
+ vuint8m4_t evenu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(even, 4, vl), 0, vl);
+ vuint8m4_t oddu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(odd, 4, vl), 0, vl);
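+ // the segment-2 store writes even[j] and odd[j] adjacently for each j,
+ // interleaving the two phases into the 2x-upsampled output row.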
+ vuint8m4x2_t o = vcreate_u8m4x2(evenu8, oddu8);
+ vsseg2e8_v_u8m4x2(out + i*2, o, vl);
+
+ // "previous" value for next iter
+ t1 = 3*in_near[i+(vl-1)] + in_far[i+(vl-1)];
+
+ i += vl;
+ n -= vl;
+ }
+#else
// process groups of 8 pixels for as long as we can.
// note we can't handle the last pixel in a row in this loop
// because we need to handle the filter boundary conditions.
@@ -3622,6 +3852,7 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
// "previous" value for next iter
t1 = 3*in_near[i+7] + in_far[i+7];
}
+#endif
t0 = t1;
t1 = 3*in_near[i] + in_far[i];
@@ -3680,7 +3911,7 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
}
}
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
{
int i = 0;
@@ -3747,7 +3978,47 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
#endif
#ifdef STBI_NEON
- // in this version, step=3 support would be easy to add. but is there demand?
+ if (step == 3) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ uint8x8_t signflip = vdup_n_u8(0x80);
+ int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
+ int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+ int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+ int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
+
+ for (; i+7 < count; i += 8) {
+ // load
+ uint8x8_t y_bytes = vld1_u8(y + i);
+ uint8x8_t cr_bytes = vld1_u8(pcr + i);
+ uint8x8_t cb_bytes = vld1_u8(pcb + i);
+ int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+ int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+ // expand to s16
+ int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+ int16x8_t crw = vshll_n_s8(cr_biased, 7);
+ int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+ // color transform
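+ // vqdmulhq_s16 computes (2*a*b)>>16 with saturation, so with crw/cbw
+ // pre-shifted left by 7 the products come out as (chroma-128)*const>>8,
+ // i.e. in the same 12.4 fixed-point format as yws.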
+ int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+ int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+ int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+ int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+ int16x8_t rws = vaddq_s16(yws, cr0);
+ int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+ int16x8_t bws = vaddq_s16(yws, cb1);
+
+ // undo scaling, round, convert to byte
+ uint8x8x3_t o;
+ o.val[0] = vqrshrun_n_s16(rws, 4);
+ o.val[1] = vqrshrun_n_s16(gws, 4);
+ o.val[2] = vqrshrun_n_s16(bws, 4);
+
+ // store, interleaving r/g/b
+ vst3_u8(out, o);
+ out += 8*3;
+ }
+ }
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
@@ -3792,6 +4063,104 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
}
#endif
+#ifdef STBI_RVV
+ if (step == 3) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ const unsigned char signflip = 0x80;
+ const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+ const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+ const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+ const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+ int n = count;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m2(n);
+
+ // load
+ vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+ vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+ vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+ vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+ vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+ // expand to s16
+ vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+ vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+ vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+ // color transform
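+ // constants are round(c*4096) and crw/cbw carry an extra <<8, so the
+ // 32-bit product >>16 is (chroma-128)*c in the same 12.4 fixed point
+ // as yws, whose +8 above is the 0.5 rounding bias for the final >>4.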
+ vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+ vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+ vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+ vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+ vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+ // undo scaling, round, convert to byte
+ vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+
+ // store, interleaving r/g/b
+ vuint8m2x3_t o = vcreate_u8m2x3(rb, gb, bb);
+ vsseg3e8_v_u8m2x3(out, o, vl);
+ out += vl*3;
+
+ i += vl;
+ n -= vl;
+ }
+ }
+ if (step == 4) {
+ // this is a fairly straightforward implementation and not super-optimized.
+ const unsigned char signflip = 128;
+ const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+ const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+ const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+ const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+ int n = count;
+ while (n > 0) {
+ size_t vl = vsetvl_e8m2(n);
+
+ // load
+ vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+ vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+ vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+ vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+ vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+ // expand to s16
+ vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+ vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+ vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+ // color transform
+ vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+ vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+ vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+ vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+ vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+ vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+ // undo scaling, round, convert to byte
+ vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+ vuint8m2_t ab = vmv_v_x_u8m2(255, vl);
+
+ // store, interleaving r/g/b/a
+ vuint8m2x4_t o = vcreate_u8m2x4(rb, gb, bb, ab);
+ vsseg4e8_v_u8m2x4(out, o, vl);
+ out += vl*4;
+
+ i += vl;
+ n -= vl;
+ }
+ }
+#endif
+
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1<<19); // rounding
int r,g,b;
@@ -3835,6 +4204,12 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
#endif
+
+#ifdef STBI_RVV
+ j->idct_block_kernel = stbi__idct_simd;
+ j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+ j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+#endif
}
// clean up the temporary component buffers
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bef56d44a58..d30229b870c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,9 +4,12 @@ if(MSVC)
add_definitions(/wd4996)
endif()
+add_library(ncnntestutil STATIC testutil.cpp)
+target_link_libraries(ncnntestutil PUBLIC ncnn)
+
macro(ncnn_add_test name)
add_executable(test_${name} test_${name}.cpp)
- target_link_libraries(test_${name} PRIVATE ncnn)
+ target_link_libraries(test_${name} PRIVATE ncnntestutil ncnn)
add_test(NAME test_${name} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:test_${name}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake)
@@ -24,7 +27,7 @@ macro(ncnn_add_layer_test class)
foreach(test_file ${test_${name}_SRCS})
get_filename_component(test_filename ${test_file} NAME_WE)
add_executable(${test_filename} ${test_file})
- target_link_libraries(${test_filename} PRIVATE ncnn)
+ target_link_libraries(${test_filename} PRIVATE ncnntestutil ncnn)
add_test(NAME ${test_filename} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:${test_filename}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake)