diff --git a/.gitignore b/.gitignore
index 259148f..3140741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,9 @@
 *.exe
 *.out
 *.app
+
+build/
+models/
+cmake-build-debug/
+cmake-build-release/
+.idea/
diff --git a/CMakeLists-win.txt b/CMakeLists-win.txt
new file mode 100644
index 0000000..ea732c6
--- /dev/null
+++ b/CMakeLists-win.txt
@@ -0,0 +1,103 @@
+cmake_minimum_required(VERSION 3.28)
+project(yolov8_trtx_v10)
+
+set(CMAKE_CXX_STANDARD 11)
+# Have nvcc pass /utf-8 to the host compiler when building .cu files
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /utf-8")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
+
+enable_language(CUDA)
+
+# Build for multiple CUDA architectures
+set(CMAKE_CUDA_ARCHITECTURES 75 86 89)
+message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+
+# OpenCV
+set(OpenCV_DIR E:\\Opencv\\install\\opencv-4.8.0\\build)
+find_package(OpenCV REQUIRED)
+include_directories(${OpenCV_INCLUDE_DIRS})
+link_directories(${OpenCV_LIB_DIR})
+
+# CUDA
+set(CUDA_TOOLKIT_ROOT_DIR C:\\Program\ Files\\NVIDIA\ GPU\ Computing\ Toolkit\\CUDA\\v11.8)
+include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
+
+# TensorRT
+#set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-8.6.1.6)
+set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-10.2.0.19)
+include_directories(${TENSORRT_ROOT}/include)
+link_directories(${TENSORRT_ROOT}/lib)
+
+# Detect the TensorRT major version under TENSORRT_ROOT (library names differ when it is greater than 8)
+# Locate the version header
+file(GLOB TENSORRT_VERSION_FILES "${TENSORRT_ROOT}/include/NvInferVersion.h")
+# Read the version header
+file(STRINGS ${TENSORRT_VERSION_FILES} TENSORRT_VERSION_LINES
+        LIMIT_COUNT 1 # read only the first matching line
+        REGEX "#define NV_TENSORRT_MAJOR [0-9]+" # match the major version define
+)
+message(STATUS " TENSORRT_VERSION_LINES: ${TENSORRT_VERSION_LINES}")
+# Parse the major version number
+string(REGEX REPLACE "#define NV_TENSORRT_MAJOR ([0-9]+)" "\\1" TENSORRT_VERSION_MAJOR ${TENSORRT_VERSION_LINES})
+message(STATUS " TENSORRT_VERSION_MAJOR: ${TENSORRT_VERSION_MAJOR}")
+# Check whether the major version is >= 10
+if (TENSORRT_VERSION_MAJOR GREATER_EQUAL 10)
+    message(STATUS " TensorRT version is greater than or equal to 10.")
+    link_libraries(
+            opencv_core
+            opencv_highgui
+            opencv_imgproc
+            opencv_imgcodecs
+            cudart
+            cublas
+            nvinfer_10
+    )
+else ()
+    message(STATUS " TensorRT version is less than 10.")
+    link_libraries(
+            opencv_core
+            opencv_highgui
+            opencv_imgproc
+            opencv_imgcodecs
+            cudart
+            cublas
+            nvinfer
+    )
+endif ()
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+include_directories(${CMAKE_SOURCE_DIR}/plugin)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+link_directories(${CMAKE_SOURCE_DIR}/lib)
+
+add_definitions(-DNOMINMAX)
+
+add_definitions(-DAPI_EXPORTS)
+
+file(GLOB_RECURSE SRCS ${CMAKE_SOURCE_DIR}/src/*.cpp ${CMAKE_SOURCE_DIR}/src/*.cu)
+file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)
+
+add_library(myplugins SHARED ${PLUGIN_SRCS})
+target_link_libraries(myplugins nvinfer_10 nvinfer_plugin_10 cudart)
+
+add_executable(yolov8_cls yolov8_cls.cpp ${SRCS})
+target_link_libraries(yolov8_cls myplugins)
+
+add_executable(yolov8_det yolov8_det.cpp ${SRCS})
+target_link_libraries(yolov8_det nvinfer_10)
+target_link_libraries(yolov8_det cudart)
+target_link_libraries(yolov8_det myplugins)
+target_link_libraries(yolov8_det ${OpenCV_LIBS})
+
+add_executable(yolov8_seg yolov8_seg.cpp ${SRCS})
+target_link_libraries(yolov8_seg nvinfer_10)
+target_link_libraries(yolov8_seg cudart)
+target_link_libraries(yolov8_seg myplugins)
+target_link_libraries(yolov8_seg ${OpenCV_LIBS})
+
+add_executable(yolov8_pose yolov8_pose.cpp ${SRCS})
+target_link_libraries(yolov8_pose nvinfer_10)
+target_link_libraries(yolov8_pose cudart)
+target_link_libraries(yolov8_pose myplugins)
+target_link_libraries(yolov8_pose ${OpenCV_LIBS})
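The diff does not say how `CMakeLists-win.txt` is meant to be invoked. A minimal Windows configure/build sketch, assuming it is copied over the default `CMakeLists.txt` and that the OpenCV/CUDA/TensorRT paths set above match the local installation:

```shell
# Sketch only: use CMakeLists-win.txt as the project file and build with MSVC.
# The generator name and the copy step are assumptions, not part of this repository.
copy CMakeLists-win.txt CMakeLists.txt
mkdir build
cd build
cmake .. -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
```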
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f8b0aca
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(yolov8)
+
+add_definitions(-std=c++11)
+add_definitions(-DAPI_EXPORTS)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)
+
+set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
+enable_language(CUDA)
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/plugin)
+
+# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+  message("embed_platform on")
+  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
+  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
+else()
+  message("embed_platform off")
+
+  # cuda
+  include_directories(/usr/local/cuda/include)
+  link_directories(/usr/local/cuda/lib64)
+
+  # tensorrt
+  include_directories(/workspace/shared/TensorRT-10.2.0.19/include/)
+  link_directories(/workspace/shared/TensorRT-10.2.0.19/lib/)
+
+  # include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
+  # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)
+endif()
+
+add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
+target_link_libraries(myplugins nvinfer cudart)
+
+find_package(OpenCV)
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
+add_executable(yolov8_det ${PROJECT_SOURCE_DIR}/yolov8_det.cpp ${SRCS})
+
+target_link_libraries(yolov8_det nvinfer)
+target_link_libraries(yolov8_det cudart)
+target_link_libraries(yolov8_det myplugins)
+target_link_libraries(yolov8_det ${OpenCV_LIBS})
+
+add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS})
+target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS})
+
+add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS})
+target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})
+
+add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
+target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ec08d80
--- /dev/null
+++ b/README.md
@@ -0,0 +1,143 @@
+## Introduction
+
+YOLOv8 models with TensorRT-10 support.
+
+## Environment
+
+CUDA: 11.8
+CUDNN: 8.9.1.23
+TensorRT: TensorRT-10.2.0.19
+
+## Support
+
+* [x] YOLOv8-cls supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-det supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-seg supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-pose supports FP32/FP16/INT8 and Python/C++ API
+
+## Config
+
+* Choose the YOLOv8 sub-model n/s/m/l/x/n6/s6/m6/l6/x6 from the command line arguments.
+* For other configs, please check [src/config.h](src/config.h)
+## Build and Run
+
+1. Generate .wts from pytorch with .pt, or download .wts from model zoo
+
+```shell
+git clone https://gitclone.com/github.com/ultralytics/ultralytics.git
+git clone https://github.com/mpj1234/YOLOv8-series-TensorRT10.git
+cd YOLOv8-series-TensorRT10/
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-cls.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-seg.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-pose.pt
+cp [PATH-TO-YOLOv8-series-TensorRT10]/yolov8/gen_wts.py .
+python gen_wts.py -w yolov8n-cls.pt -o yolov8n-cls.wts -t cls
+python gen_wts.py -w yolov8n.pt -o yolov8n.wts
+python gen_wts.py -w yolov8n-seg.pt -o yolov8n-seg.wts -t seg
+python gen_wts.py -w yolov8n-pose.pt -o yolov8n-pose.wts -t pose
+# The files 'yolov8n-cls.wts', 'yolov8n.wts', 'yolov8n-seg.wts' and 'yolov8n-pose.wts' will be generated.
+```
+
+2. Build YOLOv8-series-TensorRT10 and run
+
+#### Classification
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kClsNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-cls.wts .
+cmake ..
+make
+
+# Download ImageNet labels
+wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt
+
+# Build and serialize TensorRT engine
+./yolov8_cls -s yolov8n-cls.wts yolov8n-cls.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_cls -d yolov8n-cls.engine ../images
+# The results are displayed in the console
+```
+
+3. Optional: load and run the TensorRT model in Python
+```shell
+# Install python-tensorrt, pycuda, etc.
+# Make sure yolov8n-cls.engine exists
+python yolov8_cls_trt.py ./build/yolov8n-cls.engine ../images
+# FAQ: on Windows you may hit pycuda._driver.LogicError
+# FAQ: on Linux you may hit a Segmentation fault
+# In that case, add the following code to the py file:
+# import pycuda.autoinit
+# import pycuda.driver as cuda
+```
+
+#### Detection
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_det -s yolov8n.wts yolov8n.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_det -d yolov8n.engine ../images [c/g]
+# The results are displayed in the console
+```
+
+#### Segmentation
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-seg.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_seg -s yolov8n-seg.wts yolov8n-seg.engine [n/s/m/l/x]
+
+# Download the labels file
+wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt
+
+# Run inference
+./yolov8_seg -d yolov8n-seg.engine ../images [c/g] coco.txt
+# The results are displayed in the console
+```
+
+#### Pose
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kPoseNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-pose.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_pose -d yolov8n-pose.engine ../images c
+# The results are displayed in the console
+```
+
+## INT8 Quantization
+1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
+2. Unzip it in yolov8_trt10/build
+3. Set the macro `USE_INT8` in src/config.h and make again
+4. Serialize the model and test (see the sketch below)
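As a concrete sketch of steps 2-4 for the detection model (the archive name `coco_calib.zip` and the `n` sub-model are assumptions, not fixed by this repo; `./coco_calib` matches `kInputQuantizationFolder` in src/config.h):

```shell
# Illustrative INT8 run for yolov8_det; adjust names and paths to your setup.
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10/build
unzip coco_calib.zip            # calibration images end up in ./coco_calib
# enable USE_INT8 (and disable USE_FP16/USE_FP32) in src/config.h, then rebuild
make
./yolov8_det -s yolov8n.wts yolov8n.engine n   # serialize the INT8-calibrated engine
./yolov8_det -d yolov8n.engine ../images g     # run inference on the sample images
```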
diff --git a/gen_wts.py b/gen_wts.py
new file mode 100644
index 0000000..5f037db
--- /dev/null
+++ b/gen_wts.py
@@ -0,0 +1,57 @@
+import sys  # noqa: F401
+import argparse
+import os
+import struct
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
+    parser.add_argument('-w', '--weights', required=True,
+                        help='Input weights (.pt) file path (required)')
+    parser.add_argument(
+        '-o', '--output', help='Output (.wts) file path (optional)')
+    parser.add_argument(
+        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose'],
+        help='determines the model type: detect/cls/seg/pose')
+    args = parser.parse_args()
+    if not os.path.isfile(args.weights):
+        raise SystemExit('Invalid input file')
+    if not args.output:
+        args.output = os.path.splitext(args.weights)[0] + '.wts'
+    elif os.path.isdir(args.output):
+        args.output = os.path.join(
+            args.output,
+            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
+    return args.weights, args.output, args.type
+
+
+pt_file, wts_file, m_type = parse_args()
+
+print(f'Generating .wts for {m_type} model')
+
+# Load model
+print(f'Loading {pt_file}')
+
+# Initialize
+device = 'cpu'
+
+# Load model
+model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
+
+if m_type in ['detect', 'seg', 'pose']:
+    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
+
+    delattr(model.model[-1], 'anchors')
+
+model.to(device).eval()
+
+with open(wts_file, 'w') as f:
+    f.write('{}\n'.format(len(model.state_dict().keys())))
+    for k, v in model.state_dict().items():
+        vr = v.reshape(-1).cpu().numpy()
+        f.write('{} {} '.format(k, len(vr)))
+        for vv in vr:
+            f.write(' ')
+            f.write(struct.pack('>f', float(vv)).hex())
+        f.write('\n')
diff --git a/images/bus.jpg b/images/bus.jpg
new file mode 100644
index 0000000..40eaaf5
Binary files /dev/null and b/images/bus.jpg differ
diff --git a/images/cat.jpg b/images/cat.jpg
new file mode 100644
index 0000000..df4a907
Binary files /dev/null and b/images/cat.jpg differ
diff --git a/images/dog.jpg b/images/dog.jpg
new file mode 100644
index 0000000..e76c295
Binary files /dev/null and b/images/dog.jpg differ
diff --git a/images/zidane.jpg b/images/zidane.jpg
new file mode 100644
index 0000000..eeab1cd
Binary files /dev/null and b/images/zidane.jpg differ
diff --git a/include/block.h b/include/block.h
new file mode 100644
index 0000000..1816e01
--- /dev/null
+++ b/include/block.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include "NvInfer.h"
+
+std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);
+
+nvinfer1::IScaleLayer *addBatchNorm2d(nvinfer1::INetworkDefinition *network,
+                                      std::map<std::string, nvinfer1::Weights> weightMap,
+                                      nvinfer1::ITensor &input, std::string lname, float eps);
+
+nvinfer1::IElementWiseLayer *convBnSiLU(nvinfer1::INetworkDefinition *network,
+                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input,
+                                        int ch, int k, int s, int p, std::string lname);
+
+nvinfer1::IElementWiseLayer *C2F(nvinfer1::INetworkDefinition *network,
+                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input, int c1,
+                                 int c2, int n, bool shortcut, float e, std::string lname);
+
+nvinfer1::IElementWiseLayer *C2(nvinfer1::INetworkDefinition *network,
+                                std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input, int c1,
+                                int c2, int n, bool shortcut, float e, std::string lname);
+
+nvinfer1::IElementWiseLayer *SPPF(nvinfer1::INetworkDefinition *network,
+                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input, int c1,
+                                  int c2, int k, std::string lname);
+
+nvinfer1::IShuffleLayer *DFL(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> weightMap,
+                             nvinfer1::ITensor &input, int ch, int grid, int k, int s, int p, std::string lname);
+
+nvinfer1::IPluginV2Layer *addYoLoLayer(nvinfer1::INetworkDefinition *network,
+                                       std::vector<nvinfer1::IConcatenationLayer *> dets, const int *px_arry,
+                                       int px_arry_num, bool is_segmentation, bool is_pose);
diff --git a/include/calibrator.h b/include/calibrator.h
new file mode 100644
index 0000000..9bb60a7
--- /dev/null
+++ b/include/calibrator.h
@@ -0,0 +1,39 @@
+#ifndef ENTROPY_CALIBRATOR_H
+#define ENTROPY_CALIBRATOR_H
+
+#include <NvInfer.h>
+#include <string>
+#include <vector>
+#include "macros.h"
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
+    virtual ~Int8EntropyCalibrator2();
+    int getBatchSize() const TRT_NOEXCEPT override;
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
+    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
+    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
+
+private:
+    int batchsize_;
+    int input_w_;
+    int input_h_;
+    int img_idx_;
+    std::string img_dir_;
+    std::vector<std::string> img_files_;
+    size_t input_count_;
+    std::string calib_table_name_;
+    const char* input_blob_name_;
+    bool read_cache_;
+    void* device_input_;
+    std::vector<char> calib_cache_;
+};
+
+#endif  // ENTROPY_CALIBRATOR_H
diff --git a/include/config.h b/include/config.h
new file mode 100644
index 0000000..e9c70c1
--- /dev/null
+++ b/include/config.h
@@ -0,0 +1,27 @@
+// #define USE_FP16
+// #define USE_FP32
+#define USE_INT8
+
+const static char* kInputTensorName = "images";
+const static char* kOutputTensorName = "output";
+const static char *kProtoTensorName = "proto";
+const static int kNumClass = 80;
+const static int kPoseNumClass = 1;
+const static int kNumberOfPoints = 17;  // number of keypoints total
+const static int kBatchSize = 1;
+const static int kGpuId = 0;
+const static int kInputH = 640;
+const static int kInputW = 640;
+const static float kNmsThresh = 0.45f;
+const static float kConfThresh = 0.5f;
+const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
+const static int kMaxInputImageSize = 3000 * 3000;
+const static int kMaxNumOutputBbox = 1000;
+// Quantization input image folder path
+const static char* kInputQuantizationFolder = "./coco_calib";
+
+// Classification model's number of classes
+constexpr static int kClsNumClass = 1000;
+// Classification model's input shape
+constexpr static int kClsInputH = 224;
+constexpr
static int kClsInputW = 224; diff --git a/include/cuda_utils.h b/include/cuda_utils.h new file mode 100644 index 0000000..8fbd319 --- /dev/null +++ b/include/cuda_utils.h @@ -0,0 +1,18 @@ +#ifndef TRTX_CUDA_UTILS_H_ +#define TRTX_CUDA_UTILS_H_ + +#include + +#ifndef CUDA_CHECK +#define CUDA_CHECK(callstr)\ + {\ + cudaError_t error_code = callstr;\ + if (error_code != cudaSuccess) {\ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ + assert(0);\ + }\ + } +#endif // CUDA_CHECK + +#endif // TRTX_CUDA_UTILS_H_ + diff --git a/include/logging.h b/include/logging.h new file mode 100644 index 0000000..6b79a8b --- /dev/null +++ b/include/logging.h @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_LOGGING_H +#define TENSORRT_LOGGING_H + +#include "NvInferRuntimeCommon.h" +#include +#include +#include +#include +#include +#include +#include +#include "macros.h" + +using Severity = nvinfer1::ILogger::Severity; + +class LogStreamConsumerBuffer : public std::stringbuf +{ +public: + LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mOutput(stream) + , mPrefix(prefix) + , mShouldLog(shouldLog) + { + } + + LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) + : mOutput(other.mOutput) + { + } + + ~LogStreamConsumerBuffer() + { + // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence + // std::streambuf::pptr() gives a pointer to the current position of the output sequence + // if the pointer to the beginning is not equal to the pointer to the current position, + // call putOutput() to log the output to the stream + if (pbase() != pptr()) + { + putOutput(); + } + } + + // synchronizes the stream buffer and returns 0 on success + // synchronizing the stream buffer consists of inserting the buffer contents into the stream, + // resetting the buffer and flushing the stream + virtual int sync() + { + putOutput(); + return 0; + } + + void putOutput() + { + if (mShouldLog) + { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(×tamp); + std::cout << "["; + std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; + // std::stringbuf::str() gets the string contents of the buffer + // insert the buffer contents pre-appended by the appropriate prefix into the stream + mOutput << mPrefix << str(); + // set the buffer to empty + str(""); + // flush the stream + mOutput.flush(); + } + } + + void setShouldLog(bool 
shouldLog) + { + mShouldLog = shouldLog; + } + +private: + std::ostream& mOutput; + std::string mPrefix; + bool mShouldLog; +}; + +//! +//! \class LogStreamConsumerBase +//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer +//! +class LogStreamConsumerBase +{ +public: + LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mBuffer(stream, prefix, shouldLog) + { + } + +protected: + LogStreamConsumerBuffer mBuffer; +}; + +//! +//! \class LogStreamConsumer +//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. +//! Order of base classes is LogStreamConsumerBase and then std::ostream. +//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field +//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. +//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. +//! Please do not change the order of the parent classes. +//! +class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream +{ +public: + //! \brief Creates a LogStreamConsumer which logs messages with level severity. + //! Reportable severity determines if the messages are severe enough to be logged. + LogStreamConsumer(Severity reportableSeverity, Severity severity) + : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(severity <= reportableSeverity) + , mSeverity(severity) + { + } + + LogStreamConsumer(LogStreamConsumer&& other) + : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(other.mShouldLog) + , mSeverity(other.mSeverity) + { + } + + void setReportableSeverity(Severity reportableSeverity) + { + mShouldLog = mSeverity <= reportableSeverity; + mBuffer.setShouldLog(mShouldLog); + } + +private: + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + static std::string severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + bool mShouldLog; + Severity mSeverity; +}; + +//! \class Logger +//! +//! \brief Class which manages logging of TensorRT tools and samples +//! +//! \details This class provides a common interface for TensorRT tools and samples to log information to the console, +//! and supports logging two types of messages: +//! +//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) +//! - Test pass/fail messages +//! +//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is +//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. +//! +//! In the future, this class could be extended to support dumping test results to a file in some standard format +//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). +//! +//! 
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. + +class Logger : public nvinfer1::ILogger +{ +public: + Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! + nvinfer1::ILogger& getTRTLogger() + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) TRT_NOEXCEPT override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! 
\param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) + { + auto cmdline = genCmdlineString(argc, argv); + return defineTest(name, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! + static void reportTestEnd(const TestAtom& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int reportPass(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int reportFail(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int reportWaive(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int reportTest(const TestAtom& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(const TestAtom& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! 
+ static std::string genCmdlineString(int argc, char const* const* argv) + { + std::stringstream ss; + for (int i = 0; i < argc; i++) + { + if (i > 0) + ss << " "; + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; + +namespace +{ + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +// ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! 
+inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} + +} // anonymous namespace + +#endif // TENSORRT_LOGGING_H diff --git a/include/macros.h b/include/macros.h new file mode 100644 index 0000000..b187c94 --- /dev/null +++ b/include/macros.h @@ -0,0 +1,29 @@ +#ifndef __MACROS_H +#define __MACROS_H + +#include "NvInfer.h" + +#ifdef API_EXPORTS +#if defined(_MSC_VER) +#define API __declspec(dllexport) +#else +#define API __attribute__((visibility("default"))) +#endif +#else + +#if defined(_MSC_VER) +#define API __declspec(dllimport) +#else +#define API +#endif +#endif // API_EXPORTS + +#if NV_TENSORRT_MAJOR >= 8 +#define TRT_NOEXCEPT noexcept +#define TRT_CONST_ENQUEUE const +#else +#define TRT_NOEXCEPT +#define TRT_CONST_ENQUEUE +#endif + +#endif // __MACROS_H diff --git a/include/model.h b/include/model.h new file mode 100644 index 0000000..ad865df --- /dev/null +++ b/include/model.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "NvInfer.h" + +nvinfer1::IHostMemory *buildEngineYolov8Cls(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw); + +nvinfer1::IHostMemory *buildEngineYolov8Det(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8DetP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8DetP2(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8Seg(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8Pose(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8PoseP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); \ No newline at end of file diff --git a/include/postprocess.h b/include/postprocess.h new file mode 100644 index 0000000..eb18d54 --- /dev/null +++ b/include/postprocess.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include "NvInfer.h" +#include "types.h" + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); + +void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh = 0.5); + +void draw_bbox(std::vector& img_batch, std::vector>& res_batch); + +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); + +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch); + +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count); + +void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, 
float* parray, int max_objects, + cudaStream_t stream); + +void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); + +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map); diff --git a/include/preprocess.h b/include/preprocess.h new file mode 100644 index 0000000..10bead9 --- /dev/null +++ b/include/preprocess.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include "NvInfer.h" +#include "types.h" +#include + + +void cuda_preprocess_init(int max_image_size); + +void cuda_preprocess_destroy(); + +void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream); + +void cuda_batch_preprocess(std::vector &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream); + diff --git a/include/types.h b/include/types.h new file mode 100644 index 0000000..472c735 --- /dev/null +++ b/include/types.h @@ -0,0 +1,18 @@ +#pragma once +#include "config.h" + +struct alignas(float) Detection { + //center_x center_y w h + float bbox[4]; + float conf; // bbox_conf * cls_conf + float class_id; + float mask[32]; + float keypoints[51]; // 17*3 keypoints +}; + +struct AffineMatrix { + float value[6]; +}; + +const int bbox_element = + sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag diff --git a/include/utils.h b/include/utils.h new file mode 100644 index 0000000..610c8e2 --- /dev/null +++ b/include/utils.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include + +static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { + int w, h, x, y; + float r_w = input_w / (img.cols*1.0); + float r_h = input_h / (img.rows*1.0); + if (r_h > r_w) { + w = input_w; + h = r_w * img.rows; + x = 0; + y = (input_h - h) / 2; + } else { + w = r_h * img.cols; + h = input_h; + x = (input_w - w) / 2; + y = 0; + } + cv::Mat re(h, w, CV_8UC3); + cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); + cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); + re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); + return out; +} + +static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { + DIR *p_dir = opendir(p_dir_name); + if (p_dir == nullptr) { + return -1; + } + + struct dirent* p_file = nullptr; + while ((p_file = readdir(p_dir)) != nullptr) { + if (strcmp(p_file->d_name, ".") != 0 && + strcmp(p_file->d_name, "..") != 0) { + //std::string cur_file_name(p_dir_name); + //cur_file_name += "/"; + //cur_file_name += p_file->d_name; + std::string cur_file_name(p_file->d_name); + file_names.push_back(cur_file_name); + } + } + + closedir(p_dir); + return 0; +} + +// Function to trim leading and trailing whitespace from a string +static inline std::string trim_leading_whitespace(const std::string& str) { + size_t first = str.find_first_not_of(' '); + if (std::string::npos == first) { + return str; + } + size_t last = str.find_last_not_of(' '); + return str.substr(first, (last - first + 1)); +} + +// Src: https://stackoverflow.com/questions/16605967 +static inline std::string to_string_with_precision(const float a_value, const int n = 2) { + std::ostringstream out; + out.precision(n); + out << std::fixed << a_value; + return out.str(); +} + +static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { + std::ifstream file(labels_filename); + // Read each line of the file + std::string line; + int index = 0; + while 
(std::getline(file, line)) { + // Strip the line of any leading or trailing whitespace + line = trim_leading_whitespace(line); + + // Add the stripped line to the labels_map, using the loop index as the key + labels_map[index] = line; + index++; + } + // Close the file + file.close(); + + return 0; +} + diff --git a/plugin/yololayer.cu b/plugin/yololayer.cu new file mode 100644 index 0000000..c42b841 --- /dev/null +++ b/plugin/yololayer.cu @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include "cuda_utils.h" +#include "types.h" +#include "yololayer.h" + +namespace Tn { +template +void write(char*& buffer, const T& val) { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} + +template +void read(const char*& buffer, T& val) { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} +} // namespace Tn + +__device__ float sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + +namespace nvinfer1 { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, + int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides, + int stridesLength) { + + mClassCount = classCount; + mNumberofpoints = numberofpoints; + mConfthreshkeypoints = confthreshkeypoints; + mYoloV8NetWidth = netWidth; + mYoloV8netHeight = netHeight; + mMaxOutObject = maxOut; + mStridesLength = stridesLength; + mStrides = new int[stridesLength]; + memcpy(mStrides, strides, stridesLength * sizeof(int)); + is_segmentation_ = is_segmentation; + is_pose_ = is_pose; +} + +YoloLayerPlugin::~YoloLayerPlugin() { + if (mStrides != nullptr) { + delete[] mStrides; + mStrides = nullptr; + } +} + +YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { + using namespace Tn; + const char *d = reinterpret_cast(data), *a = d; + read(d, mClassCount); + read(d, mNumberofpoints); + read(d, mConfthreshkeypoints); + read(d, mThreadCount); + read(d, mYoloV8NetWidth); + read(d, mYoloV8netHeight); + read(d, mMaxOutObject); + read(d, mStridesLength); + mStrides = new int[mStridesLength]; + for (int i = 0; i < mStridesLength; ++i) { + read(d, mStrides[i]); + } + read(d, is_segmentation_); + read(d, is_pose_); + + assert(d == a + length); +} + +void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { + + using namespace Tn; + char *d = static_cast(buffer), *a = d; + write(d, mClassCount); + write(d, mNumberofpoints); + write(d, mConfthreshkeypoints); + write(d, mThreadCount); + write(d, mYoloV8NetWidth); + write(d, mYoloV8netHeight); + write(d, mMaxOutObject); + write(d, mStridesLength); + for (int i = 0; i < mStridesLength; ++i) { + write(d, mStrides[i]); + } + write(d, is_segmentation_); + write(d, is_pose_); + + assert(d == a + getSerializationSize()); +} + +size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { + return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_); +} + +int YoloLayerPlugin::initialize() TRT_NOEXCEPT { + return 0; +} + +nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT { + int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); + return nvinfer1::Dims3(total_size + 1, 1, 1); +} + +void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { + mPluginNamespace = 
pluginNamespace; +} + +const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { + return mPluginNamespace; +} + +nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { + return nvinfer1::DataType::kFLOAT; +} + +bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT { + + return false; +} + +bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { + + return false; +} + +void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, + nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; + +void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; + +void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} + +const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { + + return "YoloLayer_TRT"; +} + +const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} + +void YoloLayerPlugin::destroy() TRT_NOEXCEPT { + delete this; +} + +nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { + + YoloLayerPlugin* p = + new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, + mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength); + p->setPluginNamespace(mPluginNamespace); + return p; +} + +int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { + + forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); + return 0; +} + +__device__ float Logist(float data) { + return 1.0f / (1.0f + expf(-data)); +}; + +__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, + int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, + bool is_segmentation, bool is_pose) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= numElements) + return; + + const int N_kpts = nk; + int total_grid = grid_h * grid_w; + int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? 
N_kpts * 3 : 0); + int batchIdx = idx / total_grid; + int elemIdx = idx % total_grid; + const float* curInput = input + batchIdx * total_grid * info_len; + int outputIdx = batchIdx * outputElem; + + int class_id = 0; + float max_cls_prob = 0.0; + for (int i = 4; i < 4 + classes; i++) { + float p = Logist(curInput[elemIdx + i * total_grid]); + if (p > max_cls_prob) { + max_cls_prob = p; + class_id = i - 4; + } + } + + if (max_cls_prob < 0.1) + return; + + int count = (int)atomicAdd(output + outputIdx, 1); + if (count >= maxoutobject) + return; + char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); + Detection* det = (Detection*)(data); + + int row = elemIdx / grid_w; + int col = elemIdx % grid_w; + + det->conf = max_cls_prob; + det->class_id = class_id; + det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; + det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; + det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; + det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; + + if (is_segmentation) { + for (int k = 0; k < 32; ++k) { + det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid]; + } + } + + if (is_pose) { + for (int kpt = 0; kpt < N_kpts; kpt++) { + int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid; + int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid; + int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 2) * total_grid; + + float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); + + float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; + float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; + + bool is_within_bbox = + kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; + + if (kpt_confidence < confkeypoints || !is_within_bbox) { + det->keypoints[kpt * 3] = -1; + det->keypoints[kpt * 3 + 1] = -1; + det->keypoints[kpt * 3 + 2] = -1; + } else { + det->keypoints[kpt * 3] = kpt_x; + det->keypoints[kpt * 3 + 1] = kpt_y; + det->keypoints[kpt * 3 + 2] = kpt_confidence; + } + } + } +} + +void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize) { + int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); + cudaMemsetAsync(output, 0, sizeof(float), stream); + for (int idx = 0; idx < batchSize; ++idx) { + CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); + } + int numElem = 0; + + // const int maxGrids = mStridesLength; + // int grids[maxGrids][2]; + // for (int i = 0; i < maxGrids; ++i) { + // grids[i][0] = mYoloV8netHeight / mStrides[i]; + // grids[i][1] = mYoloV8NetWidth / mStrides[i]; + // } + + int maxGrids = mStridesLength; + int flatGridsLen = 2 * maxGrids; + int* flatGrids = new int[flatGridsLen]; + + for (int i = 0; i < maxGrids; ++i) { + flatGrids[2 * i] = mYoloV8netHeight / mStrides[i]; + flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i]; + } + + for (unsigned int i = 0; i < maxGrids; i++) { + // Access the elements of the original 2D array from the flattened 1D array + int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0] + int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1] + int stride = mStrides[i]; + numElem = grid_h * grid_w * batchSize; // Calculate the total number of elements + if 
(numElem < mThreadCount) // Adjust the thread count if needed + mThreadCount = numElem; + + // The CUDA kernel call remains unchanged + CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, + mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_); + } + + delete[] flatGrids; +} + +PluginFieldCollection YoloPluginCreator::mFC{}; +std::vector YoloPluginCreator::mPluginAttributes; + +YoloPluginCreator::YoloPluginCreator() { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { + return "YoloLayer_TRT"; +} + +const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} + +const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { + return &mFC; +} + +IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { + assert(fc->nbFields == 1); + assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); + const int* combinedInfo = static_cast(fc->fields[0].data); + int netinfo_count = 8; + int class_count = combinedInfo[0]; + int numberofpoints = combinedInfo[1]; + float confthreshkeypoints = combinedInfo[2]; + int input_w = combinedInfo[3]; + int input_h = combinedInfo[4]; + int max_output_object_count = combinedInfo[5]; + bool is_segmentation = combinedInfo[6]; + bool is_pose = combinedInfo[7]; + const int* px_arry = combinedInfo + netinfo_count; + int px_arry_length = fc->fields[0].length - netinfo_count; + YoloLayerPlugin* obj = + new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, + max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} + +IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT { + // This object will be deleted when the network is destroyed, which will + // call YoloLayerPlugin::destroy() + YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} + +} // namespace nvinfer1 diff --git a/plugin/yololayer.h b/plugin/yololayer.h new file mode 100644 index 0000000..b516ad8 --- /dev/null +++ b/plugin/yololayer.h @@ -0,0 +1,109 @@ +#pragma once +#include +#include +#include "NvInfer.h" +#include "macros.h" +namespace nvinfer1 { +class API YoloLayerPlugin : public IPluginV2IOExt { + public: + YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, + int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength); + + YoloLayerPlugin(const void* data, size_t length); + ~YoloLayerPlugin(); + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; + + int initialize() TRT_NOEXCEPT override; + + virtual void terminate() TRT_NOEXCEPT override {} + + virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } + + virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + virtual size_t getSerializationSize() const 
TRT_NOEXCEPT override; + + virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) const TRT_NOEXCEPT override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } + + const char* getPluginType() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override; + + IPluginV2IOExt* clone() const TRT_NOEXCEPT override; + + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; + + const char* getPluginNamespace() const TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const TRT_NOEXCEPT; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + + void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, + int32_t nbOutput) TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize); + int mThreadCount = 256; + const char* mPluginNamespace; + int mClassCount; + int mNumberofpoints; + float mConfthreshkeypoints; + int mYoloV8NetWidth; + int mYoloV8netHeight; + int mMaxOutObject; + bool is_segmentation_; + bool is_pose_; + int* mStrides; + int mStridesLength; +}; + +class API YoloPluginCreator : public IPluginCreator { + public: + YoloPluginCreator(); + ~YoloPluginCreator() override = default; + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + + nvinfer1::IPluginV2IOExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + + void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +} // namespace nvinfer1 diff --git a/src/block.cpp b/src/block.cpp new file mode 100644 index 0000000..a92a03a --- /dev/null +++ b/src/block.cpp @@ -0,0 +1,273 @@ +#include "block.h" +#include +#include +#include +#include +#include "config.h" +#include "yololayer.h" +#include "model.h" + + +std::map loadWeights(const std::string file) { + std::cout << "Loading weights: " << file << std::endl; + std::map WeightMap; + + std::ifstream input(file); + assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); + + int32_t count; + input >> count; + assert(count > 0 && "Invalid weight map file."); + + while (count--) { + nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; + uint32_t size; + + std::string name; + input >> name >> std::dec >> size; + wt.type = nvinfer1::DataType::kFLOAT; + + uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); + for (uint32_t x = 0, y = size; x < y; x++) { + input >> std::hex >> val[x]; + } + wt.values = val; + wt.count = size; + WeightMap[name] = wt; + } + return WeightMap; +} + +nvinfer1::IScaleLayer *addBatchNorm2d(nvinfer1::INetworkDefinition *network, + std::map weightMap, + nvinfer1::ITensor &input, std::string lname, float eps) { + float *gamma = (float *) weightMap[lname + ".weight"].values; + float *beta = (float *) weightMap[lname + ".bias"].values; + float *mean = (float *) weightMap[lname + ".running_mean"].values; + float *var = (float *) weightMap[lname + ".running_var"].values; + int len = weightMap[lname + ".running_var"].count; + + float *scval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + scval[i] = gamma[i] / sqrt(var[i] + eps); + } + nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; + + float *shval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); + } + nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; + + float *pval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + pval[i] = 1.0; + } + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; + weightMap[lname + ".scale"] = scale; + weightMap[lname + ".shift"] = shift; + weightMap[lname + ".power"] = power; + nvinfer1::IScaleLayer *output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); + assert(output); + return output; +} + +nvinfer1::IElementWiseLayer *convBnSiLU(nvinfer1::INetworkDefinition *network, + std::map weightMap, nvinfer1::ITensor &input, + int ch, int k, int s, int p, std::string lname) { + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); + assert(conv); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + + nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); + + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + return ew; +} + +nvinfer1::ILayer *bottleneck(nvinfer1::INetworkDefinition *network, std::map weightMap, + nvinfer1::ITensor &input, int c1, int c2, bool shortcut, float e, std::string lname) { + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1"); + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname + ".cv2"); + + if (shortcut && c1 == c2) { + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); + return ew; + } + return conv2; +} + +nvinfer1::IElementWiseLayer *C2F(nvinfer1::INetworkDefinition 
*network, + std::map weightMap, nvinfer1::ITensor &input, int c1, + int c2, int n, bool shortcut, float e, std::string lname) { + int c_ = (float) c2 * e; + + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1"); + nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); + + nvinfer1::ISliceLayer *split1 = + network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, + nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ISliceLayer *split2 = + network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, + nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ITensor *inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(inputTensor0, 2); + nvinfer1::ITensor *y1 = split2->getOutput(0); + for (int i = 0; i < n; i++) { + auto *b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i)); + y1 = b->getOutput(0); + + nvinfer1::ITensor *inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; + cat = network->addConcatenation(inputTensors, 2); + } + + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + + return conv2; +} + +nvinfer1::IElementWiseLayer *C2(nvinfer1::INetworkDefinition *network, + std::map &weightMap, nvinfer1::ITensor &input, int c1, + int c2, int n, bool shortcut, float e, std::string lname) { + assert(network != nullptr); + int hidden_channels = static_cast(c2 * e); + + // cv1 branch + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, 0, lname + ".cv1"); + nvinfer1::ITensor *cv1_out = conv1->getOutput(0); + + // Split the output of cv1 into two tensors + nvinfer1::Dims dims = cv1_out->getDimensions(); + nvinfer1::ISliceLayer *split1 = + network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, + nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, + nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ISliceLayer *split2 = + network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, + nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, + nvinfer1::Dims4{1, 1, 1, 1}); + + // Create y1 bottleneck sequence + nvinfer1::ITensor *y1 = split1->getOutput(0); + for (int i = 0; i < n; ++i) { + auto *bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0, + lname + ".m." 
+ std::to_string(i)); + y1 = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck + } + + // Concatenate y1 with the second split of cv1 + nvinfer1::ITensor *concatInputs[2] = {y1, split2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(concatInputs, 2); + + // cv2 to produce the final output + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + + return conv2; +} + +nvinfer1::IElementWiseLayer *SPPF(nvinfer1::INetworkDefinition *network, + std::map weightMap, nvinfer1::ITensor &input, int c1, + int c2, int k, std::string lname) { + int c_ = c1 / 2; + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1"); + nvinfer1::IPoolingLayer *pool1 = + network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::IPoolingLayer *pool2 = + network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::IPoolingLayer *pool3 = + network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::ITensor *inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), + pool3->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(inputTensors, 4); + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + return conv2; +} + +nvinfer1::IShuffleLayer *DFL(nvinfer1::INetworkDefinition *network, std::map weightMap, + nvinfer1::ITensor &input, int ch, int grid, int k, int s, int p, std::string lname) { + + nvinfer1::IShuffleLayer *shuffle1 = network->addShuffle(input); + shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); + shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); + nvinfer1::ISoftMaxLayer *softmax = network->addSoftMax(*shuffle1->getOutput(0)); + softmax->setAxes(1 << 1); + + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + + nvinfer1::IShuffleLayer *shuffle2 = network->addShuffle(*conv->getOutput(0)); + shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); + + return shuffle2; +} + +nvinfer1::IPluginV2Layer *addYoLoLayer(nvinfer1::INetworkDefinition *network, + std::vector dets, const int *px_arry, + int px_arry_num, bool is_segmentation, bool is_pose) { + auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); + const int netinfo_count = 8; // Assuming the first 5 elements are for netinfo as per existing code. + const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. + + std::vector combinedInfo(total_count); + // Fill in the first 5 elements as per existing netinfo. + combinedInfo[0] = is_pose ? 
kPoseNumClass : kNumClass; + combinedInfo[1] = kNumberOfPoints; + combinedInfo[2] = kConfThreshKeypoints; + combinedInfo[3] = kInputW; + combinedInfo[4] = kInputH; + combinedInfo[5] = kMaxNumOutputBbox; + combinedInfo[6] = is_segmentation; + combinedInfo[7] = is_pose; + + // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. + std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); + + // Now let's create the PluginField object to hold this combined information. + nvinfer1::PluginField pluginField; + pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize + pluginField.data = combinedInfo.data(); + pluginField.type = nvinfer1::PluginFieldType::kINT32; + pluginField.length = combinedInfo.size(); + + // Create the PluginFieldCollection to hold the PluginField object. + nvinfer1::PluginFieldCollection pluginFieldCollection; + pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array + pluginFieldCollection.fields = &pluginField; + + // Create the plugin object using the PluginFieldCollection. + nvinfer1::IPluginV2 *pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); + + // We assume that the plugin is to be added onto the network. + // Prepare input tensors for the YOLO Layer. + std::vector inputTensors; + for (auto det: dets) { + inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. + } + + // Add the plugin to the network using the prepared input tensors. + nvinfer1::IPluginV2Layer *yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); + + return yoloLayer; // Return the added YOLO layer. +} diff --git a/src/calibrator.cpp b/src/calibrator.cpp new file mode 100644 index 0000000..6202788 --- /dev/null +++ b/src/calibrator.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include "calibrator.h" +#include "cuda_utils.h" +#include "utils.h" + +Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, + const char* input_blob_name, bool read_cache) + : batchsize_(batchsize) + , input_w_(input_w) + , input_h_(input_h) + , img_idx_(0) + , img_dir_(img_dir) + , calib_table_name_(calib_table_name) + , input_blob_name_(input_blob_name) + , read_cache_(read_cache) +{ + input_count_ = 3 * input_w * input_h * batchsize; + CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); + read_files_in_dir(img_dir, img_files_); +} + +Int8EntropyCalibrator2::~Int8EntropyCalibrator2() +{ + CUDA_CHECK(cudaFree(device_input_)); +} + +int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT +{ + return batchsize_; +} + +bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT +{ + if (img_idx_ + batchsize_ > (int)img_files_.size()) { + return false; + } + + std::vector input_imgs_; + for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { + std::cout << img_files_[i] << " " << i << std::endl; + cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]); + if (temp.empty()){ + std::cerr << "Fatal error: image cannot open!" 
<< std::endl; + return false; + } + cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); + input_imgs_.push_back(pr_img); + } + img_idx_ += batchsize_; + cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); + CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); + assert(!strcmp(names[0], input_blob_name_)); + bindings[0] = device_input_; + return true; +} + +const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT +{ + std::cout << "reading calib cache: " << calib_table_name_ << std::endl; + calib_cache_.clear(); + std::ifstream input(calib_table_name_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) + { + std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); + } + length = calib_cache_.size(); + return length ? calib_cache_.data() : nullptr; +} + +void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT +{ + std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; + std::ofstream output(calib_table_name_, std::ios::binary); + output.write(reinterpret_cast(cache), length); +} + diff --git a/src/model.cpp b/src/model.cpp new file mode 100644 index 0000000..af3301c --- /dev/null +++ b/src/model.cpp @@ -0,0 +1,2050 @@ +#include +#include + +#include "block.h" +#include "calibrator.h" +#include "config.h" +#include "model.h" + +static int get_width(int x, float gw, int max_channels, int divisor = 8) { + auto channel = int(ceil((x * gw) / divisor)) * divisor; + return channel >= max_channels ? max_channels : channel; +} + +static int get_depth(int x, float gd) { + if (x == 1) + return 1; + int r = round(x * gd); + if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) + --r; + return std::max(r, 1); +} + +void calculateStrides(nvinfer1::IElementWiseLayer *conv_layers[], int size, int reference_size, int strides[]) { + for (int i = 0; i < size; ++i) { + nvinfer1::ILayer *layer = conv_layers[i]; + nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + int feature_map_size = dims.d[2]; + strides[i] = reference_size / feature_map_size; + } +} + +nvinfer1::IHostMemory *buildEngineYolov8Cls(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + int max_channels = 1280; + // ****************************************** YOLOV8 INPUT ********************************************** + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, + nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW}); + assert(data); + + // ***************************************** YOLOV8 BACKBONE ******************************************** + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // C2 Block (11233) + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, 
max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // C2 Block Sequence (22466) + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // C2 Block Sequence (22466) + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // C2 Block (11233) + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + // ********************************************* YOLOV8 HEAD ********************************************* + + auto conv_class = convBnSiLU(network, weightMap, *conv8->getOutput(0), 1280, 1, 1, 0, "model.9.conv"); + // Adjusted code + nvinfer1::Dims dims = + conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the output of conv_class + assert(dims.nbDims == 4); // Make sure there are exactly 3 dimensions (channels, height, width) + + nvinfer1::IPoolingLayer *pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, + nvinfer1::DimsHW{dims.d[2], dims.d[3]}); + assert(pool2); + + // Fully connected layer declaration + auto shuffle_0 = network->addShuffle(*pool2->getOutput(0)); + shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, max_channels}); + auto linear_weight = weightMap["model.9.linear.weight"]; + auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, max_channels}, linear_weight); + auto constant_bias = network->addConstant(nvinfer1::Dims2{kBatchSize, kClsNumClass}, + weightMap["model.9.linear.bias"]); + auto linear_matrix_multipy = network->addMatrixMultiply(*shuffle_0->getOutput(0), + nvinfer1::MatrixOperation::kNONE, + *constant_weight->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + auto yolo = network->addElementWise(*linear_matrix_multipy->getOutput(0), *constant_bias->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + assert(yolo); + + // Set the name for the output tensor and mark it as network output + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + // Set the maximum batch size and workspace size + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + + // Configuration according to the precision mode being used +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder, + "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + // Begin building the engine; this may take a while + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Cleanup the network definition and allocated weights + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory *buildEngineYolov8Det(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, 
max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = + convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = + convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], 
weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = + convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); + cat22_dfl_0->setAxis(1); + + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); + cat22_dfl_1->setAxis(1); 
+ + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, false); + + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8DetP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer *conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer *conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer *conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + 
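    // Stride sketch (illustration only): model.0/1/3/5/7/9 above are the stride-2 convs, so
    // the taps handed to calculateStrides() further down (conv3/conv5/conv7/conv9) sit at
    // strides 8/16/32/64 for any input whose side is a multiple of 64. Quick arithmetic
    // check of the stride = reference_size / feature_map_size rule, assuming a 640x640 input:
    static_assert(640 / (640 / 8) == 8 && 640 / (640 / 16) == 16 && 640 / (640 / 32) == 32 &&
                          640 / (640 / 64) == 64,
                  "P3..P6 strides expected to be 8/16/32/64");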
/******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample12->setScales(scale, 4); + nvinfer1::ITensor *concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer *conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample15->setScales(scale, 4); + nvinfer1::ITensor *concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer *conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer *upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample18->setScales(scale, 4); + nvinfer1::ITensor *concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer *conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer *conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor *concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer *conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer *conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor *concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer *conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer *conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor *concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + 
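    // P6 bottom-up step (sketch): conv27 is the stride-2 downsample of the P5 path and
    // conv11 is the SPPF output, so the two inputs gathered above share the same H x W and
    // differ only in channel count; addConcatenation() below joins them along the default
    // axis, which for these 4-D tensors is the channel axis. Illustration-only sanity check,
    // assuming static, explicit-batch dimensions:
    {
        nvinfer1::Dims a = conv27->getOutput(0)->getDimensions();
        nvinfer1::Dims b = conv11->getOutput(0)->getDimensions();
        assert(a.nbDims == 4 && b.nbDims == 4);
        assert(a.d[2] == b.d[2] && a.d[3] == b.d[3]);  // matching spatial size before concat
    }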
nvinfer1::IConcatenationLayer *concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer *conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_0_1 = + convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_1_1 = + convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 
0}); + nvinfer1::ITensor *inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_2_2 = + network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_2_2 = + network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_3_2 = + network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_3_2 = + network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_3 = network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, 
conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); + cat30_dfl_0->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); + cat30_dfl_1->setAxis(1); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / 
strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); + cat30_dfl_2->setAxis(1); + + // P6 processing steps + nvinfer1::IShuffleLayer *shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); + cat30_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8DetP2(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P4 + 
nvinfer1::IResizeLayer *upsample10 = network->addResize( + *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + nvinfer1::ITensor *concat11_inputs[] = { + upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::IConcatenationLayer *concat11 = network->addConcatenation(concat11_inputs, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + // P3 + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + nvinfer1::ITensor *concat14_inputs[] = {upsample13->getOutput(0), + conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" + nvinfer1::IConcatenationLayer *concat14 = network->addConcatenation(concat14_inputs, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + + // P2 + nvinfer1::IResizeLayer *upsample16 = network->addResize(*conv15->getOutput(0)); + upsample16->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample16->setScales(scale, 4); + nvinfer1::ITensor *concat17_inputs[] = {upsample16->getOutput(0), + conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" + nvinfer1::IConcatenationLayer *concat17 = network->addConcatenation(concat17_inputs, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + + // Additional layers for P3, P4, P5 + // Downsample and concatenate for P3 + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(128, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *concat20_inputs[] = { + conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 + nvinfer1::IConcatenationLayer *concat20 = network->addConcatenation(concat20_inputs, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + // Downsample and concatenate for P4 + nvinfer1::IElementWiseLayer *conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.22"); + nvinfer1::ITensor *concat23_inputs[] = { + conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 + nvinfer1::IConcatenationLayer *concat23 = network->addConcatenation(concat23_inputs, 2); + nvinfer1::IElementWiseLayer *conv24 = + C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); + + // Downsample and concatenate for P5 + nvinfer1::IElementWiseLayer *conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.25"); + nvinfer1::ITensor *concat26_inputs[] = { + 
conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 + nvinfer1::IConcatenationLayer *concat26 = network->addConcatenation(concat26_inputs, 2); + nvinfer1::IElementWiseLayer *conv27 = + C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ +// int ch_0 = conv18->getOutput(0)->getDimensions().d[1]; +// int base_in_channel = std::max(16, std::max(ch_0 / 4, 64)); +// int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); + int base_in_channel = 64; + int base_out_channel = (gw == 0.25) ? std::max(32, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv28_cv2_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_0_1 = + convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_0_2 = + network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); + conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_0_1 = + convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.28.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_0_2 = + network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); + conv28_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_0 = network->addConcatenation(inputTensor28_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv28_cv2_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_1_1 = + convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_1_2 = + network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); + conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); + nvinfer1::IConvolutionLayer 
*conv28_cv3_1_2 = + network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); + conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_1 = network->addConcatenation(inputTensor28_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv28_cv2_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_2_1 = + convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_2_2 = + network->addConvolutionNd(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); + conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_2_2 = + network->addConvolutionNd(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); + conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_2 = network->addConcatenation(inputTensor28_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv28_cv2_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_3_1 = + convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_3_2 = + network->addConvolutionNd(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); + conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_3_1 = + convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_3_2 = + network->addConvolutionNd(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); + conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_3 = network->addConcatenation(inputTensor28_3, 2); + + 
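+    // NOTE: in the detect stage below each of the four raw head outputs (cat28_0..cat28_3) is reshaped
+    // to [kBatchSize, 64 + kNumClass, gridH * gridW] and split: the first 64 channels are the box
+    // branch (interpreted by DFL as 4 box sides x 16 distribution bins), the remaining kNumClass
+    // channels are the class scores. calculateStrides() presumably derives each scale's stride from the
+    // corresponding backbone layer's output height, which for this P2 variant gives {4, 8, 16, 32}.
+    // Illustration only, not part of the network (assuming a 640x640 input):
+    //     int cells = 0;
+    //     for (int s : {4, 8, 16, 32}) cells += (640 / s) * (640 / s);  // 25600 + 6400 + 1600 + 400 = 34000 anchor points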
/******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv1, conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P2 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); + shuffle28_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split28_0_0 = network->addSlice( + *shuffle28_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_0_1 = network->addSlice( + *shuffle28_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_0 = + DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); + cat28_dfl_0->setAxis(1); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); + shuffle28_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split28_1_0 = network->addSlice( + *shuffle28_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_1_1 = network->addSlice( + *shuffle28_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_1 = + DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); + cat28_dfl_1->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); + shuffle28_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split28_2_0 = network->addSlice( + *shuffle28_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_2_1 = network->addSlice( + *shuffle28_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / 
strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_2 = + DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); + cat28_dfl_2->setAxis(1); + + // P5 processing steps + nvinfer1::IShuffleLayer *shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); + shuffle28_3->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split28_3_0 = network->addSlice( + *shuffle28_3->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_3_1 = network->addSlice( + *shuffle28_3->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_3 = + DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); + cat28_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +static nvinfer1::IElementWiseLayer *convBnSiLUProto(nvinfer1::INetworkDefinition *network, + std::map weightMap, + nvinfer1::ITensor &input, + int ch, int k, int s, int p, std::string lname) { + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); + assert(conv); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + conv->setName((lname + ".conv").c_str()); + + nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); + bn->setName((lname + ".bn").c_str()); + // This concat operator is not used for calculation, in order to prevent the operator fusion unrealized error when int8 is quantized. + // Error Code 10: Internal Error (Could not find any implementation for node + // model.22.proto.cv3.conv + model.22.proto.cv3.sigmoid + PWN(PWN((Unnamed Layer* 353) [Activation]), PWN(model.22.proto.cv3.silu)).) + +#if defined(USE_INT8) + nvinfer1::ITensor *inputTensors[] = {bn->getOutput(0)}; + auto concat = network->addConcatenation(inputTensors, 1); + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*concat->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + assert(sigmoid); + bn->setName((lname + ".sigmoid").c_str()); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*concat->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + ew->setName((lname + ".silu").c_str()); +#else + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + assert(sigmoid); + bn->setName((lname + ".sigmoid").c_str()); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + ew->setName((lname + ".silu").c_str()); +#endif + return ew; +} + +static nvinfer1::IElementWiseLayer *Proto(nvinfer1::INetworkDefinition *network, + std::map &weightMap, nvinfer1::ITensor &input, + std::string lname, float gw, int max_channels) { + int mid_channel = get_width(256, gw, max_channels); + auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, 3, 1, 1, "model.22.proto.cv1"); + float *convTranpsose_bais = (float *) weightMap["model.22.proto.upsample.bias"].values; + int convTranpsose_bais_len = weightMap["model.22.proto.upsample.bias"].count; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, convTranpsose_bais, convTranpsose_bais_len}; + auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2}, + weightMap["model.22.proto.upsample.weight"], bias); + assert(convTranpsose); + convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2}); + auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, 3, 1, 1, + "model.22.proto.cv2"); + auto cv3 = convBnSiLUProto(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 0, "model.22.proto.cv3"); + assert(cv3); + return cv3; +} + +static nvinfer1::IShuffleLayer *cv4_conv_combined(nvinfer1::INetworkDefinition *network, + std::map &weightMap, + nvinfer1::ITensor &input, std::string lname, int grid_shape, float gw, + std::string algo_type) { + int mid_channle = 0; + int output_channel = 0; + + if (algo_type == "seg") { + 
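+        // The width of the mask-coefficient branch follows the model scale: gw 0.25/0.5 (n/s) -> 32,
+        // 0.75 (m) -> 48, 1.00 (l) -> 64, 1.25 (x) -> 80. The branch always emits 32 coefficients per
+        // anchor point, matching the 32 prototype masks produced by Proto() above.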
if (gw == 0.25 || gw == 0.5) { + mid_channle = 32; + } else if (gw == 0.75) { + mid_channle = 48; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 32; + } else if (algo_type == "pose") { + std::string bn_weight_key = lname + ".0.bn.weight"; + mid_channle = weightMap[bn_weight_key].count; + output_channel = kNumberOfPoints * 3; + } + + auto cv0 = convBnSiLU(network, weightMap, input, mid_channle, 3, 1, 1, lname + ".0"); + auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), mid_channle, 3, 1, 1, lname + ".1"); + float *cv2_bais_value = (float *) weightMap[lname + ".2" + ".bias"].values; + int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; + nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; + auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), output_channel, nvinfer1::DimsHW{1, 1}, + weightMap[lname + ".2" + ".weight"], cv2_bais); + cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); + nvinfer1::IShuffleLayer *cv2_shuffle = network->addShuffle(*cv2->getOutput(0)); + cv2_shuffle->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, output_channel, grid_shape}); + + return cv2_shuffle; +} + +nvinfer1::IHostMemory * +buildEngineYolov8Seg(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, 
max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) 
? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + 
weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / 
strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det0 + auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + proto_coef_0->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + cat22_dfl_0->setAxis(1); + + // det1 + auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + proto_coef_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + cat22_dfl_1->setAxis(1); + + // det2 + auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + proto_coef_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, true, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + auto proto = Proto(network, weightMap, *conv15->getOutput(0), "model.22.proto", gw, max_channels); + proto->getOutput(0)->setName(kProtoTensorName); + network->markOutput(*proto->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory *buildEngineYolov8Pose(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + 
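+    // scale[] holds per-axis resize factors for the NCHW tensor: batch and channels stay unchanged,
+    // H and W are doubled, i.e. a 2x nearest-neighbour upsample feeding the top-down (FPN) path.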
upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], 
weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + /**************************************************************************************P3****************************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det0 + auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + shuffle_conv15->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + cat22_dfl_0->setAxis(1); + + /********************************************************************************************P4**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / 
strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det1 + auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + shuffle_conv18->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + cat22_dfl_1->setAxis(1); + + /********************************************************************************************P5**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det2 + auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + shuffle_conv21->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, true); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8PoseP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer *conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer *conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer *conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + 
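+    // NOTE: compared with the stride-32 pose backbone above, this P6 variant adds one more downsampling
+    // stage: conv7/conv8 run at (scaled) width 768 (P5/32) and conv9/conv10 at width 1024 (P6/64)
+    // before SPPF, so the head below fuses and predicts on four scales, P3 (stride 8) through
+    // P6 (stride 64).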
/******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample12->setScales(scale, 4); + nvinfer1::ITensor *concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer *conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample15->setScales(scale, 4); + nvinfer1::ITensor *concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer *conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer *upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample18->setScales(scale, 4); + nvinfer1::ITensor *concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer *conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer *conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor *concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer *conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer *conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor *concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer *conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer *conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor *concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + 
nvinfer1::IConcatenationLayer *concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer *conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + 
conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_2_2 = + network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_2_2 = + network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_3_2 = + network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_3_2 = + network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_3 = network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + 
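+    // Decode layout (a sketch, based on how the slices below are assumed to line up):
+    // calculateStrides() reads the spatial size of conv3/conv5/conv7/conv9, which for a
+    // P6 model should resolve to strides {8, 16, 32, 64}. Each level is reshaped to
+    // [kBatchSize, 64 + kPoseNumClass, (kInputH/stride) * (kInputW/stride)], split into
+    // 64 DFL box channels and kPoseNumClass score channels, then concatenated with the
+    // cv4 keypoint branch before being handed to the YOLO plugin layer.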
nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det0 + auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0), + shuffle_conv20->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 3); + cat30_dfl_0->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det1 + auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0), + shuffle_conv23->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 3); + cat30_dfl_1->setAxis(1); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize,64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), + 
nvinfer1::Dims3{0,0, 0}, + nvinfer1::Dims3{kBatchSize,64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::ISliceLayer *split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0,64, 0}, + nvinfer1::Dims3{kBatchSize,kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::IShuffleLayer *dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv26 = cv4_conv_combined(network, weightMap, *conv26->getOutput(0), "model.30.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0), + shuffle_conv26->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 3); + cat30_dfl_2->setAxis(1); + + // P6 processing steps + nvinfer1::IShuffleLayer *shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims3{kBatchSize,64 + kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0,0, 0}, + nvinfer1::Dims3{kBatchSize,64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::ISliceLayer *split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0,64, 0}, + nvinfer1::Dims3{kBatchSize,kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::IShuffleLayer *dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det3 + auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3", + (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0), + shuffle_conv29->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 3); + cat30_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false, true); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} diff --git a/src/postprocess.cpp b/src/postprocess.cpp new file mode 100644 index 0000000..f19acc0 --- /dev/null +++ b/src/postprocess.cpp @@ -0,0 +1,269 @@ +#include "postprocess.h" +#include "utils.h" + +cv::Rect get_rect(cv::Mat& img, float bbox[4]) { + float l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + + if (r_h > r_w) { + l = bbox[0]; + r = bbox[2]; + t = bbox[1] - (kInputH - r_w * img.rows) / 2; + b = bbox[3] - (kInputH - r_w * img.rows) / 2; + l = l / r_w; + r = r / r_w; + t = t / r_w; + b = b / r_w; + } else { + l = bbox[0] - (kInputW - r_h * img.cols) / 2; + r = bbox[2] - (kInputW - r_h * img.cols) / 2; + t = bbox[1]; + b = bbox[3]; + l = l / r_h; + r = r / r_h; + t = t / r_h; + b = b / r_h; + } + l = std::max(0.0f, l); + t = std::max(0.0f, t); + int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); + int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); + + return cv::Rect(int(round(l)), int(round(t)), width, height); +} + +cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) { + float l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + l = bbox[0] / r_w; + r = bbox[2] / r_w; + t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; + b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; + for (int i = 0; i < kNumberOfPoints * 3; i += 3) { + lmk[i] /= r_w; + lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; + // lmk[i + 2] + } + } else { + l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; + r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; + t = bbox[1] / r_h; + b = bbox[3] / r_h; + for (int i = 0; i < kNumberOfPoints * 3; i += 3) { + lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; + lmk[i + 1] /= r_h; + // lmk[i + 2] + } + } + l = std::max(0.0f, l); + t = std::max(0.0f, t); + int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); + int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); + + return cv::Rect(int(round(l)), int(round(t)), width, height); +} + +static float iou(float lbox[4], float rbox[4]) { + float interBox[] = { + (std::max)(lbox[0], rbox[0]), + (std::min)(lbox[2], rbox[2]), + (std::max)(lbox[1], rbox[1]), + (std::min)(lbox[3], rbox[3]), + }; + + if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) + return 0.0f; + + float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); + float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; + return interBoxS / unionBoxS; +} + +static bool cmp(const Detection& a, const Detection& b) { + if (a.conf == b.conf) { + return a.bbox[0] < b.bbox[0]; + } + return a.conf > b.conf; +} + +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { + int det_size = sizeof(Detection) / sizeof(float); + std::map> m; + + for (int i = 0; i < output[0]; i++) { + if (output[1 + det_size * i + 4] <= conf_thresh) + continue; + Detection det; + memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); + if (m.count(det.class_id) == 0) + m.emplace(det.class_id, std::vector()); + m[det.class_id].push_back(det); + } + for (auto it = m.begin(); it != m.end(); it++) { + auto& dets = it->second; + 
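+        // Greedy per-class NMS: sort this class's candidates by confidence (cmp), keep
+        // the best box, and erase any later box whose IoU with it exceeds nms_thresh.
+        // Typical call site: nms(res, &output[b * kOutputSize], kConfThresh, kNmsThresh)
+        // for batch element b, as done by batch_nms() below.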
std::sort(dets.begin(), dets.end(), cmp); + for (size_t m = 0; m < dets.size(); ++m) { + auto& item = dets[m]; + res.push_back(item); + for (size_t n = m + 1; n < dets.size(); ++n) { + if (iou(item.bbox, dets[n].bbox) > nms_thresh) { + dets.erase(dets.begin() + n); + --n; + } + } + } + } +} + +void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh) { + res_batch.resize(batch_size); + for (int i = 0; i < batch_size; i++) { + nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); + } +} + +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count) { + Detection det; + for (int i = 0; i < count; i++) { + int basic_pos = 1 + i * bbox_element; + int keep_flag = decode_ptr_host[basic_pos + 6]; + if (keep_flag == 1) { + det.bbox[0] = decode_ptr_host[basic_pos + 0]; + det.bbox[1] = decode_ptr_host[basic_pos + 1]; + det.bbox[2] = decode_ptr_host[basic_pos + 2]; + det.bbox[3] = decode_ptr_host[basic_pos + 3]; + det.conf = decode_ptr_host[basic_pos + 4]; + det.class_id = decode_ptr_host[basic_pos + 5]; + res.push_back(det); + } + } +} + +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch) { + res_batch.resize(batch_size); + int count = static_cast(*decode_ptr_host); + count = std::min(count, kMaxNumOutputBbox); + for (int i = 0; i < batch_size; i++) { + auto& img = const_cast(img_batch[i]); + process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); + } +} + +void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect(img, res[j].bbox); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + } + } +} + +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) { + const std::vector> skeleton_pairs = { + {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, + {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; + + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + + for (int k = 0; k < kNumberOfPoints * 3; k += 3) { + if (res[j].keypoints[k + 2] > 0.5) { + cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, + cv::Scalar(0, 0x27, 0xC1), -1); + } + } + + for (const auto& bone : skeleton_pairs) { + int kp1_idx = bone.first * 3; + int kp2_idx = bone.second * 3; + if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { + cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); + cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); + cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); + } + } + } + } +} + +cv::Mat scale_mask(cv::Mat mask, cv::Mat 
img) { + int x, y, w, h; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + w = kInputW; + h = r_w * img.rows; + x = 0; + y = (kInputH - h) / 2; + } else { + w = r_h * img.cols; + h = kInputH; + x = (kInputW - w) / 2; + y = 0; + } + cv::Rect r(x, y, w, h); + cv::Mat res; + cv::resize(mask(r), res, img.size()); + return res; +} + +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map) { + static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, + 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, + 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; + for (size_t i = 0; i < dets.size(); i++) { + cv::Mat img_mask = scale_mask(masks[i], img); + auto color = colors[(int)dets[i].class_id % colors.size()]; + auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); + + cv::Rect r = get_rect(img, dets[i].bbox); + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float val = img_mask.at(y, x); + if (val <= 0.5) + continue; + img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; + img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; + img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; + } + } + + cv::rectangle(img, r, bgr, 2); + + // Get the size of the text + cv::Size textSize = + cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), + cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); + // Set the top left corner of the rectangle + cv::Point topLeft(r.x, r.y - textSize.height); + + // Set the bottom right corner of the rectangle + cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); + + // Set the thickness of the rectangle lines + int lineThickness = 2; + + // Draw the rectangle on the image + cv::rectangle(img, topLeft, bottomRight, bgr, -1); + + cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), + cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); + } +} diff --git a/src/postprocess.cu b/src/postprocess.cu new file mode 100644 index 0000000..3cae042 --- /dev/null +++ b/src/postprocess.cu @@ -0,0 +1,84 @@ +// +// Created by lindsay on 23-7-17. 
+// +#include "types.h" +#include "postprocess.h" + +static __global__ void +decode_kernel(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects) { + float count = predict[0]; + int position = (blockDim.x * blockIdx.x + threadIdx.x); + if (position >= count) return; + + float *pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); + int index = atomicAdd(parray, 1); + if (index >= max_objects) return; + + float confidence = pitem[4]; + if (confidence < confidence_threshold) return; + + float left = pitem[0]; + float top = pitem[1]; + float right = pitem[2]; + float bottom = pitem[3]; + float label = pitem[5]; + + float *pout_item = parray + 1 + index * bbox_element; + *pout_item++ = left; + *pout_item++ = top; + *pout_item++ = right; + *pout_item++ = bottom; + *pout_item++ = confidence; + *pout_item++ = label; + *pout_item++ = 1; // 1 = keep, 0 = ignore +} + +static __device__ float +box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { + float cleft = max(aleft, bleft); + float ctop = max(atop, btop); + float cright = min(aright, bright); + float cbottom = min(abottom, bbottom); + float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); + if (c_area == 0.0f) return 0.0f; + + float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); + float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); + return c_area / (a_area + b_area - c_area); +} + +static __global__ void nms_kernel(float *bboxes, int max_objects, float threshold) { + int position = (blockDim.x * blockIdx.x + threadIdx.x); + int count = bboxes[0]; + if (position >= count) return; + + float *pcurrent = bboxes + 1 + position * bbox_element; + for (int i = 0; i < count; ++i) { + float *pitem = bboxes + 1 + i * bbox_element; + if (i == position || pcurrent[5] != pitem[5]) continue; + if (pitem[4] >= pcurrent[4]) { + if (pitem[4] == pcurrent[4] && i < position) continue; + float iou = box_iou( + pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], + pitem[0], pitem[1], pitem[2], pitem[3] + ); + if (iou > threshold) { + pcurrent[6] = 0; + return; + } + } + } +} + +void cuda_decode(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects, + cudaStream_t stream) { + int block = 256; + int grid = ceil(num_bboxes / (float)block); + decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); +} + +void cuda_nms(float *parray, float nms_threshold, int max_objects, cudaStream_t stream) { + int block = max_objects < 256 ? 
max_objects : 256; + int grid = ceil(max_objects / (float)block); + nms_kernel<<>>(parray, max_objects, nms_threshold); +} diff --git a/src/preprocess.cu b/src/preprocess.cu new file mode 100644 index 0000000..14d9e77 --- /dev/null +++ b/src/preprocess.cu @@ -0,0 +1,155 @@ +#include "preprocess.h" +#include "cuda_utils.h" + +static uint8_t *img_buffer_host = nullptr; +static uint8_t *img_buffer_device = nullptr; + + +__global__ void +warpaffine_kernel(uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width, + int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { + int position = blockDim.x * blockIdx.x + threadIdx.x; + if (position >= edge) return; + + float m_x1 = d2s.value[0]; + float m_y1 = d2s.value[1]; + float m_z1 = d2s.value[2]; + float m_x2 = d2s.value[3]; + float m_y2 = d2s.value[4]; + float m_z2 = d2s.value[5]; + + int dx = position % dst_width; + int dy = position / dst_width; + float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; + float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; + float c0, c1, c2; + + if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { + // out of range + c0 = const_value_st; + c1 = const_value_st; + c2 = const_value_st; + } else { + int y_low = floorf(src_y); + int x_low = floorf(src_x); + int y_high = y_low + 1; + int x_high = x_low + 1; + + uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; + float ly = src_y - y_low; + float lx = src_x - x_low; + float hy = 1 - ly; + float hx = 1 - lx; + float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + uint8_t *v1 = const_value; + uint8_t *v2 = const_value; + uint8_t *v3 = const_value; + uint8_t *v4 = const_value; + + if (y_low >= 0) { + if (x_low >= 0) + v1 = src + y_low * src_line_size + x_low * 3; + + if (x_high < src_width) + v2 = src + y_low * src_line_size + x_high * 3; + } + + if (y_high < src_height) { + if (x_low >= 0) + v3 = src + y_high * src_line_size + x_low * 3; + + if (x_high < src_width) + v4 = src + y_high * src_line_size + x_high * 3; + } + + c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; + c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; + c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; + } + + // bgr to rgb + float t = c2; + c2 = c0; + c0 = t; + + // normalization + c0 = c0 / 255.0f; + c1 = c1 / 255.0f; + c2 = c2 / 255.0f; + + // rgbrgbrgb to rrrgggbbb + int area = dst_width * dst_height; + float *pdst_c0 = dst + dy * dst_width + dx; + float *pdst_c1 = pdst_c0 + area; + float *pdst_c2 = pdst_c1 + area; + *pdst_c0 = c0; + *pdst_c1 = c1; + *pdst_c2 = c2; +} + + + + +void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, + cudaStream_t stream) { + int img_size = src_width * src_height * 3; + // copy data to pinned memory + memcpy(img_buffer_host, src, img_size); + // copy data to device memory + CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); + + AffineMatrix s2d, d2s; + float scale = std::min(dst_height / (float) src_height, dst_width / (float) src_width); + + s2d.value[0] = scale; + s2d.value[1] = 0; + s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; + s2d.value[3] = 0; + s2d.value[4] = scale; + s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; + cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); + cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); + cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); + + memcpy(d2s.value, m2x3_d2s.ptr(0), 
sizeof(d2s.value)); + + int jobs = dst_height * dst_width; + int threads = 256; + int blocks = ceil(jobs / (float) threads); + warpaffine_kernel<<>>( + img_buffer_device, src_width * 3, src_width, + src_height, dst, dst_width, + dst_height, 128, d2s, jobs); +} + + +void cuda_batch_preprocess(std::vector &img_batch, + float *dst, int dst_width, int dst_height, + cudaStream_t stream) { + int dst_size = dst_width * dst_height * 3; + for (size_t i = 0; i < img_batch.size(); i++) { + cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, + dst_height, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } +} + + + + + +void cuda_preprocess_init(int max_image_size) { + // prepare input data in pinned memory + CUDA_CHECK(cudaMallocHost((void **) &img_buffer_host, max_image_size * 3)); + // prepare input data in device memory + CUDA_CHECK(cudaMalloc((void **) &img_buffer_device, max_image_size * 3)); +} + +void cuda_preprocess_destroy() { + CUDA_CHECK(cudaFree(img_buffer_device)); + CUDA_CHECK(cudaFreeHost(img_buffer_host)); +} + + + + diff --git a/yolov8_cls.cpp b/yolov8_cls.cpp new file mode 100644 index 0000000..1ab490b --- /dev/null +++ b/yolov8_cls.cpp @@ -0,0 +1,305 @@ +#include "cuda_utils.h" +#include "logging.h" +#include "utils.h" +#include "model.h" +#include "config.h" +#include "calibrator.h" + +#include +#include +#include +#include +#include + +using namespace nvinfer1; + +static Logger gLogger; +const static int kOutputSize = kClsNumClass; + +void batch_preprocess(std::vector &imgs, float *output, int dst_width = 224, int dst_height = 224) { + for (size_t b = 0; b < imgs.size(); b++) { + int h = imgs[b].rows; + int w = imgs[b].cols; + int m = std::min(h, w); + int top = (h - m) / 2; + int left = (w - m) / 2; + cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); + cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + img.convertTo(img, CV_32F, 1 / 255.0); + + std::vector channels(3); + cv::split(img, channels); + + // CHW format + for (int c = 0; c < 3; ++c) { + int i = 0; + for (int row = 0; row < dst_height; ++row) { + for (int col = 0; col < dst_width; ++col) { + output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = + channels[c].at(row, col); + ++i; + } + } + } + } +} + +std::vector softmax(float *prob, int n) { + std::vector res; + float sum = 0.0f; + float t; + for (int i = 0; i < n; i++) { + t = expf(prob[i]); + res.push_back(t); + sum += t; + } + for (int i = 0; i < n; i++) { + res[i] /= sum; + } + return res; +} + +std::vector topk(const std::vector &vec, int k) { + std::vector topk_index; + std::vector vec_index(vec.size()); + std::iota(vec_index.begin(), vec_index.end(), 0); + + std::sort(vec_index.begin(), vec_index.end(), + [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; }); + + int k_num = std::min(vec.size(), k); + + for (int i = 0; i < k_num; ++i) { + topk_index.push_back(vec_index[i]); + } + + return topk_index; +} + +std::vector read_classes(std::string file_name) { + std::vector classes; + std::ifstream ifs(file_name, std::ios::in); + if (!ifs.is_open()) { + std::cerr << file_name << " is not found, pls refer to README and download it." 
<< std::endl; + assert(0); + } + std::string s; + while (std::getline(ifs, s)) { + classes.push_back(s); + } + ifs.close(); + return classes; +} + +bool +parse_args(int argc, char **argv, std::string &wts, std::string &engine, float &gd, float &gw, std::string &img_dir) { + if (argc < 4) return false; + if (std::string(argv[1]) == "-s" && (argc == 5)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto net = std::string(argv[4]); + if (net[0] == 'n') { + gd = 0.33; + gw = 0.25; + } else if (net[0] == 's') { + gd = 0.33; + gw = 0.50; + } else if (net[0] == 'm') { + gd = 0.67; + gw = 0.75; + } else if (net[0] == 'l') { + gd = 1.0; + gw = 1.0; + } else if (net[0] == 'x') { + gd = 1.0; + gw = 1.25; + } else { + return false; + } + } else if (std::string(argv[1]) == "-d" && argc == 4) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + } else { + return false; + } + return true; +} + +void prepare_buffers(ICudaEngine *engine, float **gpu_input_buffer, float **gpu_output_buffer, float **cpu_input_buffer, + float **output_buffer_host) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. + // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); + + *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; + *output_buffer_host = new float[kBatchSize * kOutputSize]; +} + +void +infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *input, float *output, int batchSize) { + CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), + cudaMemcpyHostToDevice, stream)); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + cudaStreamSynchronize(stream); +} + +void +serialize_engine(unsigned int max_batchsize, float &gd, float &gw, std::string &wts_name, std::string &engine_name) { + // Create builder + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + // Create model to populate the network, then set the outputs and create an engine + IHostMemory *serialized_engine = nullptr; + //engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); + serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw); + assert(serialized_engine); + // Save engine to file + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cerr << "Could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), 
serialized_engine->size()); + + // Close everything down + delete serialized_engine; + delete config; + delete builder; +} + +void +deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n-cls.wts ../models/yolov8n-cls.fp32.trt n + // -d ../models/yolov8n-cls.fp32.trt ../images + cudaSetDevice(kGpuId); + + std::string wts_name = ""; + std::string engine_name = ""; + float gd = 0.0f, gw = 0.0f; + std::string img_dir; + + if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) { + std::cerr << "arguments not right!" << std::endl; + std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" + << std::endl; + std::cerr << "./yolov8_cls -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Prepare cpu and gpu buffers + float *device_buffers[2]; + float *cpu_input_buffer = nullptr; + float *output_buffer_host = nullptr; + prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host); + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." 
<< std::endl; + return -1; + } + + // Read imagenet labels + auto classes = read_classes("imagenet_classes.txt"); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + + // Preprocess + batch_preprocess(img_batch, cpu_input_buffer); + + // Run inference + auto start = std::chrono::system_clock::now(); + infer(*context, stream, (void **) device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + + // Postprocess and get top-k result + for (size_t b = 0; b < img_name_batch.size(); b++) { + float *p = &output_buffer_host[b * kOutputSize]; + auto res = softmax(p, kOutputSize); + auto topk_idx = topk(res, 3); + std::cout << img_name_batch[b] << std::endl; + for (auto idx: topk_idx) { + std::cout << " " << classes[idx] << " " << res[idx] << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + delete[] cpu_input_buffer; + delete[] output_buffer_host; + // Destroy the engine + delete context; + delete engine; + delete runtime; + return 0; +} diff --git a/yolov8_cls_trt.py b/yolov8_cls_trt.py new file mode 100644 index 0000000..514d1f5 --- /dev/null +++ b/yolov8_cls_trt.py @@ -0,0 +1,288 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import os +import shutil +import sys +import threading +import time +import cv2 +import numpy as np +import torch +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +with open("build/imagenet_classes.txt") as f: + classes = [line.strip() for line in f.readlines()] + + +class YoLov8TRT(object): + """ + description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. 
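+                # TensorRT 10 removed the old binding-index API (binding_is_input,
+                # get_binding_shape), so tensors are classified by name here via
+                # engine.get_tensor_mode(); shapes and dtypes come from
+                # get_tensor_shape() / get_tensor_dtype() above.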
+ if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_input_image = np.empty( + shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + batch_image_raw.append(image_raw) + input_image = self.preprocess_cls_image(image_raw) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls( + output) + cv2.putText(batch_image_raw[i], str( + classes_ls), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA) + print(classes_ls, predicted_conf_ls) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. 
+ self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224): + + """ + description: Convert BGR image to RGB, + crop the center square frame, + resize it to target size, normalize to [0,1], + transform to NCHW format. + param: + raw_bgr_image: numpy array, raw BGR image + dst_width: int, target image width + dst_height: int, target image height + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + # Crop the center square frame + m = min(h, w) + top = (h - m) // 2 + left = (w - m) // 2 + image = raw_bgr_image[top:top + m, left:left + m] + + # Resize the image with target size while maintaining ratio + image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR) + + # Convert BGR to RGB + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Normalize to [0,1] + image = image.astype(np.float32) / 255.0 + + # HWC to CHW format + image = image.transpose(2, 0, 1) + + # CHW to NCHW format (add batch dimension) + image = np.expand_dims(image, axis=0) + + # Convert the image to row-major order, also known as "C order" + image = np.ascontiguousarray(image) + + batch_data = np.expand_dims(image, axis=0) + + return batch_data + + def postprocess_cls(self, output_data): + classes_ls = [] + predicted_conf_ls = [] + category_id_ls = [] + output_data = output_data.reshape(self.batch_size, -1) + output_data = torch.Tensor(output_data) + p = torch.nn.functional.softmax(output_data, dim=1) + score, index = torch.topk(p, 3) + for ind in range(index.shape[0]): + input_category_id = index[ind][0].item() # 716 + category_id_ls.append(input_category_id) + predicted_confidence = score[ind][0].item() + predicted_conf_ls.append(predicted_confidence) + classes_ls.append(classes[input_category_id]) + return classes_ls, predicted_conf_ls, category_id_ls + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer( + self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format( + self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer( + self.yolov8_wrapper.get_raw_image_zeros()) + print( + 'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + engine_file_path = "./yolov8x-cls-fp32.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + + if os.path.exists('output/'): 
+ shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches( + yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_det.cpp b/yolov8_det.cpp new file mode 100644 index 0000000..552df96 --- /dev/null +++ b/yolov8_det.cpp @@ -0,0 +1,301 @@ +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; + +void serialize_engine(std::string &wts_name, std::string &engine_name, int &is_p, std::string &sub_type, float &gd, + float &gw, int &max_channels) { + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + IHostMemory *serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else { + serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, + IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, + float **output_buffer_host, float **decode_ptr_host, float **decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void **) decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchsize, + float *decode_ptr_host, float *decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float *) buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &is_p, std::string &img_dir, + std::string &sub_type, std::string &cuda_post_process, float &gd, float &gw, int &max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } 
else if (sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n.wts ../models/yolov8n.fp32.trt n + // -d ../models/yolov8n.fp32.trt ../images c + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" << std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" + << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float *device_buffers[2]; + float *output_buffer_host = nullptr; + float *decode_ptr_host = nullptr; + float *decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." << std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + if (img.empty()) { + std::cerr << "Fatal error: image cannot open!" 
<< std::endl; + return -1; + } + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void **) device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + //Process gpu decode and nms results + batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + + // Draw bounding boxes + draw_bbox(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + return 0; +} diff --git a/yolov8_det_trt.py b/yolov8_det_trt.py new file mode 100644 index 0000000..81546e8 --- /dev/null +++ b/yolov8_det_trt.py @@ -0,0 +1,471 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. 
+ param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. + if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + self.det_output_length = host_outputs[0].shape[0] + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. 
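+        # Added note: PyCUDA contexts are per-thread, and infer() may run on a worker thread
+        # (see inferThread/warmUpThread below), so the context created in __init__ must be pushed
+        # onto this thread's context stack before any CUDA call and popped again at the end of the
+        # method; an unbalanced push/pop usually shows up later as context-related pycuda errors.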
+ self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + result_boxes, result_scores, result_classid = self.post_process( + output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], + batch_origin_w[i] + ) + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. 
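+        Added note on the letterboxing below: the image is scaled by r = min(input_w / w,
+        input_h / h) so the long side fits the network input, and the leftover short-side space
+        is padded symmetrically with gray (128, 128, 128); e.g. a 1280x720 frame fed to a
+        640x640 engine becomes 640x360 plus 140 px of padding on top and bottom.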
+ param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy(self, origin_h, origin_w, x): + """ + description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + param: + origin_h: height of original image + origin_w: width of original image + x: A boxes numpy, each row is a box [center_x, center_y, w, h] + return: + y: A boxes numpy, each row is a box [x1, y1, x2, y2] + """ + y = np.zeros_like(x) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + if r_h > r_w: + y[:, 0] = x[:, 0] + y[:, 2] = x[:, 2] + y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 + y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 + y /= r_w + else: + y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2 + y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 + y[:, 1] = x[:, 1] + y[:, 3] = x[:, 3] + y /= r_h + + return y + + def post_process(self, output, origin_h, origin_w): + """ + description: postprocess the prediction + param: + output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
+ origin_h: height of original image + origin_w: width of original image + return: + result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] + result_scores: finally scores, a numpy, each element is the score correspoing to box + result_classid: finally classid, a numpy, each element is the classid correspoing to box + """ + num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + # Get the num of boxes detected + num = int(output[0]) + # Reshape to a two dimentional ndarray + # pred = np.reshape(output[1:], (-1, 38))[:num, :] + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + # Do nms + boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + return result_boxes, result_scores, result_classid + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) + * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
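+        Added note: the suppression is class-aware, i.e. a kept box only removes overlapping
+        boxes that share its cls_id (the label_match test below), so boxes of different classes
+        may still overlap heavily in the final output.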
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) + # clip the coordinates + boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) + boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) + boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) + boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) + # Object confidence + confs = boxes[:, 4] + # Sort by the confs + boxes = boxes[np.argsort(-confs)] + # Perform non-maximum suppression + keep_boxes = [] + while boxes.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres + label_match = boxes[0, -1] == boxes[:, -1] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + keep_boxes += [boxes[0]] + boxes = boxes[~invalid] + boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) + return boxes + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "build/libmyplugins.so" + engine_file_path = "yolov8s.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", + "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", + "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", + "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", + "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", 
"vase", "scissors", + "teddy bear", + "hair drier", "toothbrush"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_pose.cpp b/yolov8_pose.cpp new file mode 100644 index 0000000..da1b8cf --- /dev/null +++ b/yolov8_pose.cpp @@ -0,0 +1,297 @@ + +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 32) / sizeof(float) + 1; + +void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, + float& gw, int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + std::cout << "p2 is not supported right now" << std::endl; + } else { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char* serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, + float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if 
(sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char** argv) { + // -s ../models/yolov8n-pose.wts ../models/yolov8n-pose.fp32.trt n + // -d ../models/yolov8n-pose.fp32.trt ../images c + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" << std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" + << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime* runtime = nullptr; + ICudaEngine* engine = nullptr; + IExecutionContext* context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float* device_buffers[2]; + float* output_buffer_host = nullptr; + float* decode_ptr_host = nullptr; + float* decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." 
<< std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // todo pose in gpu + std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; + } + // Draw bounding boxes + draw_bbox_keypoints_line(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + return 0; +} diff --git a/yolov8_pose_trt.py b/yolov8_pose_trt.py new file mode 100644 index 0000000..a4a1183 --- /dev/null +++ b/yolov8_pose_trt.py @@ -0,0 +1,511 @@ +""" +An example that uses TensorRT's Python api to make inferences. 
+""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 +keypoint_pairs = [ + (0, 1), (0, 2), (0, 5), (0, 6), (1, 2), + (1, 3), (2, 4), (5, 6), (5, 7), (5, 11), + (6, 8), (6, 12), (7, 9), (8, 10), (11, 12), + (11, 13), (12, 14), (13, 15), (14, 16) +] + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. + param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. 
+ if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + self.det_output_size = host_outputs[0].shape[0] + print('batch_size:', self.batch_size) + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], + input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. 
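+        # Added note on the layout assumed by the slicing below: each image owns det_output_size
+        # floats of host_outputs[0]; element 0 of its slice is the detection count, followed by
+        # fixed-size rows of DET_NUM + SEG_NUM + POSE_NUM = 6 + 32 + 51 = 89 values
+        # (box/conf/class, unused mask-coefficient slots, then 17 keypoints as x, y, conf).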
+ self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + + result_boxes, result_scores, result_classid, keypoints = self.post_process( + output[i * (self.det_output_size): (i + 1) * (self.det_output_size)], + batch_origin_h[i], batch_origin_w[i] + ) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + + num_keypoints = len(keypoints[j]) // 3 + points = [] + for k in range(num_keypoints): + x = keypoints[j][k * 3] + y = keypoints[j][k * 3 + 1] + confidence = keypoints[j][k * 3 + 2] + if confidence > 0: + points.append((int(x), int(y))) + else: + points.append(None) + + # 根据关键点索引对绘制线条 + for pair in keypoint_pairs: + partA, partB = pair + if points[partA] and points[partB]: + cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2) + + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. + param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints): + + n = len(boxes) + box_array = np.zeros_like(boxes) + keypoint_array = np.zeros_like(keypoints) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + for i in range(n): + if r_h > r_w: + box = boxes[i] + lmk = keypoints[i] + box_array[i, 0] = box[0] / r_w + box_array[i, 2] = box[2] / r_w + box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w + box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w + + for j in range(0, 
len(lmk), 3): + keypoint_array[i, j] = lmk[j] / r_w + keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w + keypoint_array[i, j + 2] = lmk[j + 2] + else: + + box = boxes[i] + lmk = keypoints[i] + + box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 1] = box[1] / r_h + box_array[i, 3] = box[3] / r_h + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h + keypoint_array[i, j + 1] = lmk[j + 1] / r_h + keypoint_array[i, j + 2] = lmk[j + 2] + + return box_array, keypoint_array + + def post_process(self, output, origin_h, origin_w): + """ + description: Post-process the prediction to include pose keypoints + param: + output: A numpy array like [num_boxes, cx, cy, w, h, conf, + cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint + origin_h: Height of original image + origin_w: Width of original image + return: + result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2] + result_scores: Final scores, a numpy array, each element is the score corresponding to box + result_classid: Final classID, a numpy array, each element is the classid corresponding to box + result_keypoints: Final keypoints, a list of numpy arrays, + each element represents keypoints for a box, shaped as (#keypoints, 3) + """ + # Number of values per detection: 38 base values + 17 keypoints * 3 values each + num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + # Get the number of boxes detected + num = int(output[0]) + # Reshape to a two-dimensional ndarray with the full detection shape + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + + # Perform non-maximum suppression to filter the detections + boxes = self.non_max_suppression( + pred[:, :num_values_per_detection], origin_h, origin_w, + conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + + # Extract the bounding boxes, confidence scores, and class IDs + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + result_keypoints = boxes[:, -POSE_NUM:] if len(boxes) else np.array([]) + + # Return the post-processed results including keypoints + return result_boxes, result_scores, result_classid, result_keypoints + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) 
+ inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = np.clip( + inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. + param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + res_array = np.copy(boxes) + box_pred_deep_copy = np.copy(boxes[:, :4]) + keypoints_pred_deep_copy = np.copy(boxes[:, -POSE_NUM:]) + res_box, res_keypoints = self.xywh2xyxy_with_keypoints( + origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy) + res_array[:, :4] = res_box + res_array[:, -POSE_NUM:] = res_keypoints + # clip the coordinates + res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1) + res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1) + res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1) + res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1) + # Object confidence + confs = res_array[:, 4] + # Sort by the confs + res_array = res_array[np.argsort(-confs)] + # Perform non-maximum suppression + keep_res_array = [] + while res_array.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres + label_match = res_array[0, 5] == res_array[:, 5] + invalid = large_overlap & label_match + keep_res_array.append(res_array[0]) + res_array = res_array[~invalid] + + res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([]) + return res_array + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "./build/libmyplugins.so" + engine_file_path = "yolov8n-pose.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + 
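+    # Added note: the plugin library is loaded with ctypes.CDLL below before the engine is
+    # deserialized, because the serialized engine references the custom YOLO layer plugin built
+    # by this repo. Example invocation (paths are illustrative):
+    #   python yolov8_pose_trt.py yolov8n-pose.engine ./build/libmyplugins.so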
if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_seg.cpp b/yolov8_seg.cpp new file mode 100644 index 0000000..dce442f --- /dev/null +++ b/yolov8_seg.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1; +const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4); + +static cv::Rect get_downscale_rect(float bbox[4], float scale) { + + float left = bbox[0]; + float top = bbox[1]; + float right = bbox[0] + bbox[2]; + float bottom = bbox[1] + bbox[3]; + + left = left < 0 ? 0 : left; + top = top < 0 ? 0 : top; + right = right > 640 ? 640 : right; + bottom = bottom > 640 ? 640 : bottom; + + left /= scale; + top /= scale; + right /= scale; + bottom /= scale; + return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); +} + +std::vector process_mask(const float *proto, int proto_size, std::vector &dets) { + + std::vector masks; + for (size_t i = 0; i < dets.size(); i++) { + + cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); + auto r = get_downscale_rect(dets[i].bbox, 4); + + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float e = 0.0f; + for (int j = 0; j < 32; j++) { + e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + } + e = 1.0f / (1.0f + expf(-e)); + mask_mat.at(y, x) = e; + } + } + cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); + masks.push_back(mask_mat); + } + return masks; +} + +void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type, float &gd, float &gw, + int &max_channels) { + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + IHostMemory *serialized_engine = nullptr; + + serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, + IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" 
<< std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, + float **output_seg_buffer_device, float **output_buffer_host, float **output_seg_buffer_host, + float **decode_ptr_host, float **decode_ptr_device, std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 3); + // In order to bind the buffers, we need to know the names of the input and output tensors. + // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + TensorIOMode proto_mode = engine->getTensorIOMode(kProtoTensorName); + if (proto_mode != TensorIOMode::kOUTPUT) { + std::cerr << kProtoTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); + + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void **) decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, float *output_seg, + int batchsize, float *decode_ptr_host, float *decode_ptr_device, int model_bboxes, + std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.setOutputTensorAddress(kProtoTensorName, buffers[2]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + + std::cout << "kOutputSize:" << kOutputSize << std::endl; + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl; + CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float), + 
cudaMemcpyDeviceToHost, stream)); + + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float *) buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &img_dir, + std::string &sub_type, std::string &cuda_post_process, std::string &labels_filename, float &gd, + float &gw, int &max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && argc == 5) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + sub_type = std::string(argv[4]); + if (sub_type == "n") { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type == "s") { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type == "m") { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type == "l") { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if (sub_type == "x") { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + } else if (std::string(argv[1]) == "-d" && argc == 6) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + labels_filename = std::string(argv[5]); + } else { + return false; + } + return true; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n-seg.wts ../models/yolov8n-seg.fp32.trt n + // -d ../models/yolov8n-seg.fp32.trt ../images c coco.txt + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + std::string labels_filename = "../coco.txt"; + int model_bboxes; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" 
<< std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" + << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float *device_buffers[3]; + float *output_buffer_host = nullptr; + float *output_seg_buffer_host = nullptr; + float *decode_ptr_host = nullptr; + float *decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." << std::endl; + return -1; + } + + std::unordered_map labels_map; + read_labels(labels_filename, labels_map); + assert(kNumClass == labels_map.size()); + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, + &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); + + // // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void **) device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, + decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); + std::vector > res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + for (size_t b = 0; b < img_batch.size(); b++) { + auto &res = res_batch[b]; + cv::Mat img = img_batch[b]; + auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res); + draw_mask_bbox(img, res, masks, labels_map); + cv::imwrite("_" + img_name_batch[b], img); + } + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); + // todo seg in gpu + std::cerr << "seg_postprocess is not support in gpu right now" << std::endl; + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + 
CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(device_buffers[2])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + delete[] output_seg_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + // std::cout << "\nOutput:\n\n"; + // for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + // std::cout << std::endl; + + return 0; +} diff --git a/yolov8_seg_trt.py b/yolov8_seg_trt.py new file mode 100644 index 0000000..e3ed9d6 --- /dev/null +++ b/yolov8_seg_trt.py @@ -0,0 +1,590 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. + param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. 
+ # Append to the appropriate list. + if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + + # Data length + self.det_output_length = host_outputs[0].shape[0] + self.seg_output_length = host_outputs[1].shape[0] + self.seg_w = int(self.input_w / 4) + self.seg_h = int(self.input_h / 4) + self.seg_c = int(self.seg_output_length / (self.seg_w * self.seg_w)) + self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + + # Draw mask + self.colors_obj = Colors() + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.set_tensor_address(output_binding_names[1], cuda_outputs[1]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream) + + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. 
+ self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + output_proto_mask = host_outputs[1] + # Do postprocess + for i in range(self.batch_size): + result_boxes, result_scores, result_classid, result_proto_coef = self.post_process( + output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], + batch_origin_w[i] + ) + + if result_proto_coef.shape[0] == 0: + continue + result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i], + batch_origin_w[i]) + + self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid], + im_src=batch_image_raw[i]) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. + param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy(self, origin_h, origin_w, x): + """ + description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + param: + origin_h: height of original image + origin_w: width of original image + x: A boxes numpy, each row is a box [center_x, center_y, w, h] + return: + y: A boxes numpy, each row is a box [x1, y1, x2, y2] + """ + y = np.zeros_like(x) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + if r_h > r_w: + y[:, 0] = x[:, 0] + y[:, 2] = x[:, 2] + y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 + y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 + y /= r_w + else: + y[:, 0] 
= x[:, 0] - (self.input_w - r_h * origin_w) / 2 + y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 + y[:, 1] = x[:, 1] + y[:, 3] = x[:, 3] + y /= r_h + + return y + + def post_process(self, output, origin_h, origin_w): + """ + description: postprocess the prediction + param: + output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] + origin_h: height of original image + origin_w: width of original image + return: + result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] + result_scores: finally scores, a numpy, each element is the score correspoing to box + result_classid: finally classid, a numpy, each element is the classid correspoing to box + """ + # Get the num of boxes detected + num = int(output[0]) + # Reshape to a two dimentional ndarray + pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :] + + # Do nms + boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([]) + return result_boxes, result_scores, result_classid, result_proto_coef + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) + * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) + # clip the coordinates + boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) + boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) + boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) + boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) + # Object confidence + confs = boxes[:, 4] + # Sort by the confs + boxes = boxes[np.argsort(-confs)] + # Perform non-maximum suppression + keep_boxes = [] + while boxes.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres + label_match = boxes[0, 5] == boxes[:, 5] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + keep_boxes += [boxes[0]] + boxes = boxes[~invalid] + boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) + return boxes + + def sigmoid(self, x): + return 1 / (1 + np.exp(-x)) + + def scale_mask(self, mask, ih, iw): + mask = cv2.resize(mask, (self.input_w, self.input_h)) + r_w = self.input_w / (iw * 1.0) + r_h = self.input_h / (ih * 1.0) + if r_h > r_w: + w = self.input_w + h = int(r_w * ih) + x = 0 + y = int((self.input_h - h) / 2) + else: + w = int(r_h * iw) + h = self.input_h + x = int((self.input_w - w) / 2) + y = 0 + crop = mask[y:y + h, x:x + w] + crop = cv2.resize(crop, (iw, ih)) + return crop + + def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw): + """ + description: Mask pred by yolov8 instance segmentation , + param: + output_proto_mask: prototype mask e.g. 
(32, 160, 160) for 640x640 input + result_proto_coef: prototype mask coefficients (n, 32), n represents n results + result_boxes : + ih: rows of original image + iw: cols of original image + return: + mask_result: (n, ih, iw) + """ + result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w) + c, mh, mw = result_proto_masks.shape + print(result_proto_masks.shape) + print(result_proto_coef.shape) + masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh, + mw) + + mask_result = [] + for mask, box in zip(masks, result_boxes): + mask_s = np.zeros((ih, iw)) + crop_mask = self.scale_mask(mask, ih, iw) + x1 = int(box[0]) + y1 = int(box[1]) + x2 = int(box[2]) + y2 = int(box[3]) + crop = crop_mask[y1:y2, x1:x2] + crop = np.where(crop >= 0.5, 1, 0) + crop = crop.astype(np.uint8) + mask_s[y1:y2, x1:x2] = crop + + mask_result.append(mask_s) + mask_result = np.array(mask_result) + return mask_result + + def draw_mask(self, masks, colors_, im_src, alpha=0.5): + """ + description: Draw mask on image , + param: + masks : result_mask + colors_: color to draw mask + im_src : original image + alpha : scale between original image and mask + return: + no return + """ + if len(masks) == 0: + return + masks = np.asarray(masks, dtype=np.uint8) + masks = np.ascontiguousarray(masks.transpose(1, 2, 0)) + masks = np.asarray(masks, dtype=np.float32) + colors_ = np.asarray(colors_, dtype=np.float32) + s = masks.sum(2, keepdims=True).clip(0, 1) + masks = (masks @ colors_).clip(0, 255) + im_src[:] = masks * alpha + im_src * (1 - s * alpha) + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +class Colors: + def __init__(self): + hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', + '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF', + '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', + 'FF95C8', 'FF37C7') + self.palette = [self.hex2rgb(f'#{c}') for c in hexs] + self.n = len(self.palette) + + def __call__(self, i, bgr=False): + c = self.palette[int(i) % self.n] + return (c[2], c[1], c[0]) if bgr else c + + @staticmethod + def hex2rgb(h): # rgb order (PIL) + return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "build/libmyplugins.so" + engine_file_path = "yolov8n-seg.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = 
["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", + "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", + "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", + "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", + "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", + "teddy bear", + "hair drier", "toothbrush"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy()