diff --git a/.gitignore b/.gitignore
index 259148f..3140741 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,9 @@
 *.exe
 *.out
 *.app
+
+build/
+models/
+cmake-build-debug/
+cmake-build-release/
+.idea/
diff --git a/CMakeLists-win.txt b/CMakeLists-win.txt
new file mode 100644
index 0000000..ea732c6
--- /dev/null
+++ b/CMakeLists-win.txt
@@ -0,0 +1,103 @@
+cmake_minimum_required(VERSION 3.28)
+project(yolov8_trtx_v10)
+
+set(CMAKE_CXX_STANDARD 11)
+# Have nvcc pass /utf-8 to the host compiler when building .cu files
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /utf-8")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
+
+enable_language(CUDA)
+
+# Build for multiple CUDA architectures
+set(CMAKE_CUDA_ARCHITECTURES 75 86 89)
+message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+
+# OpenCV
+set(OpenCV_DIR E:\\Opencv\\install\\opencv-4.8.0\\build)
+find_package(OpenCV REQUIRED)
+include_directories(${OpenCV_INCLUDE_DIRS})
+link_directories(${OpenCV_LIB_DIR})
+
+# CUDA
+set(CUDA_TOOLKIT_ROOT_DIR C:\\Program\ Files\\NVIDIA\ GPU\ Computing\ Toolkit\\CUDA\\v11.8)
+include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include)
+link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
+
+# TensorRT
+#set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-8.6.1.6)
+set(TENSORRT_ROOT E:\\TensorRT\\TensorRT-10.2.0.19)
+include_directories(${TENSORRT_ROOT}/include)
+link_directories(${TENSORRT_ROOT}/lib)
+
+# Detect the TensorRT major version under TENSORRT_ROOT (library names differ when it is greater than 8)
+# Locate the version header
+file(GLOB TENSORRT_VERSION_FILES "${TENSORRT_ROOT}/include/NvInferVersion.h")
+# Read the version header
+file(STRINGS ${TENSORRT_VERSION_FILES} TENSORRT_VERSION_LINES
+        LIMIT_COUNT 1 # read only the first matching line
+        REGEX "#define NV_TENSORRT_MAJOR [0-9]+" # match the major version define
+)
+message(STATUS " TENSORRT_VERSION_LINES: ${TENSORRT_VERSION_LINES}")
+# Parse the major version number
+string(REGEX REPLACE "#define NV_TENSORRT_MAJOR ([0-9]+)" "\\1" TENSORRT_VERSION_MAJOR ${TENSORRT_VERSION_LINES})
+message(STATUS " TENSORRT_VERSION_MAJOR: ${TENSORRT_VERSION_MAJOR}")
+# Check whether the major version is >= 10
+if (TENSORRT_VERSION_MAJOR GREATER_EQUAL 10)
+    message(STATUS " TensorRT version is greater than or equal to 10.")
+    link_libraries(
+            opencv_core
+            opencv_highgui
+            opencv_imgproc
+            opencv_imgcodecs
+            cudart
+            cublas
+            nvinfer_10
+    )
+else ()
+    message(STATUS " TensorRT version is less than 10.")
+    link_libraries(
+            opencv_core
+            opencv_highgui
+            opencv_imgproc
+            opencv_imgcodecs
+            cudart
+            cublas
+            nvinfer
+    )
+endif ()
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+include_directories(${CMAKE_SOURCE_DIR}/plugin)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+link_directories(${CMAKE_SOURCE_DIR}/lib)
+
+add_definitions(-DNOMINMAX)
+
+add_definitions(-DAPI_EXPORTS)
+
+file(GLOB_RECURSE SRCS ${CMAKE_SOURCE_DIR}/src/*.cpp ${CMAKE_SOURCE_DIR}/src/*.cu)
+file(GLOB_RECURSE PLUGIN_SRCS ${PROJECT_SOURCE_DIR}/plugin/*.cu)
+
+add_library(myplugins SHARED ${PLUGIN_SRCS})
+target_link_libraries(myplugins nvinfer_10 nvinfer_plugin_10 cudart)
+
+add_executable(yolov8_cls yolov8_cls.cpp ${SRCS})
+target_link_libraries(yolov8_cls myplugins)
+
+add_executable(yolov8_det yolov8_det.cpp ${SRCS})
+target_link_libraries(yolov8_det nvinfer_10)
+target_link_libraries(yolov8_det cudart)
+target_link_libraries(yolov8_det myplugins)
+target_link_libraries(yolov8_det ${OpenCV_LIBS})
+
+add_executable(yolov8_seg yolov8_seg.cpp ${SRCS})
+target_link_libraries(yolov8_seg nvinfer_10)
+target_link_libraries(yolov8_seg cudart)
+target_link_libraries(yolov8_seg myplugins)
+target_link_libraries(yolov8_seg ${OpenCV_LIBS})
+
+add_executable(yolov8_pose yolov8_pose.cpp ${SRCS})
+target_link_libraries(yolov8_pose nvinfer_10)
+target_link_libraries(yolov8_pose cudart)
+target_link_libraries(yolov8_pose myplugins)
+target_link_libraries(yolov8_pose ${OpenCV_LIBS})
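The diff does not say how `CMakeLists-win.txt` is meant to be invoked. A minimal Windows configure/build sketch, assuming it is copied over the default `CMakeLists.txt` and that the OpenCV/CUDA/TensorRT paths set above match the local installation:

```shell
# Sketch only: use CMakeLists-win.txt as the project file and build with MSVC.
# The generator name and the copy step are assumptions, not part of this repository.
copy CMakeLists-win.txt CMakeLists.txt
mkdir build
cd build
cmake .. -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
```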
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..f8b0aca
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(yolov8)
+
+add_definitions(-std=c++11)
+add_definitions(-DAPI_EXPORTS)
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_BUILD_TYPE Debug)
+
+set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
+enable_language(CUDA)
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/plugin)
+
+# include and link dirs of cuda and tensorrt, you need to adapt them if yours are different
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+  message("embed_platform on")
+  include_directories(/usr/local/cuda/targets/aarch64-linux/include)
+  link_directories(/usr/local/cuda/targets/aarch64-linux/lib)
+else()
+  message("embed_platform off")
+
+  # cuda
+  include_directories(/usr/local/cuda/include)
+  link_directories(/usr/local/cuda/lib64)
+
+  # tensorrt
+  include_directories(/workspace/shared/TensorRT-10.2.0.19/include/)
+  link_directories(/workspace/shared/TensorRT-10.2.0.19/lib/)
+
+  # include_directories(/home/lindsay/TensorRT-7.2.3.4/include)
+  # link_directories(/home/lindsay/TensorRT-7.2.3.4/lib)
+endif()
+
+add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/plugin/yololayer.cu)
+target_link_libraries(myplugins nvinfer cudart)
+
+find_package(OpenCV)
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+file(GLOB_RECURSE SRCS ${PROJECT_SOURCE_DIR}/src/*.cpp ${PROJECT_SOURCE_DIR}/src/*.cu)
+add_executable(yolov8_det ${PROJECT_SOURCE_DIR}/yolov8_det.cpp ${SRCS})
+
+target_link_libraries(yolov8_det nvinfer)
+target_link_libraries(yolov8_det cudart)
+target_link_libraries(yolov8_det myplugins)
+target_link_libraries(yolov8_det ${OpenCV_LIBS})
+
+add_executable(yolov8_seg ${PROJECT_SOURCE_DIR}/yolov8_seg.cpp ${SRCS})
+target_link_libraries(yolov8_seg nvinfer cudart myplugins ${OpenCV_LIBS})
+
+add_executable(yolov8_pose ${PROJECT_SOURCE_DIR}/yolov8_pose.cpp ${SRCS})
+target_link_libraries(yolov8_pose nvinfer cudart myplugins ${OpenCV_LIBS})
+
+add_executable(yolov8_cls ${PROJECT_SOURCE_DIR}/yolov8_cls.cpp ${SRCS})
+target_link_libraries(yolov8_cls nvinfer cudart myplugins ${OpenCV_LIBS})
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ec08d80
--- /dev/null
+++ b/README.md
@@ -0,0 +1,143 @@
+## Introduction
+
+YOLOv8 models with TensorRT-10 support.
+
+## Environment
+
+CUDA: 11.8
+CUDNN: 8.9.1.23
+TensorRT: TensorRT-10.2.0.19
+
+## Support
+
+* [x] YOLOv8-cls supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-det supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-seg supports FP32/FP16/INT8 and Python/C++ API
+* [x] YOLOv8-pose supports FP32/FP16/INT8 and Python/C++ API
+
+## Config
+
+* Choose the YOLOv8 sub-model n/s/m/l/x/n6/s6/m6/l6/x6 from the command line arguments.
+* For other configs, please check [src/config.h](src/config.h)
+## Build and Run
+
+1. Generate .wts from pytorch with .pt, or download .wts from model zoo
+
+```shell
+git clone https://gitclone.com/github.com/ultralytics/ultralytics.git
+git clone https://github.com/mpj1234/YOLOv8-series-TensorRT10.git
+cd YOLOv8-series-TensorRT10/
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-cls.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-seg.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n-pose.pt
+cp [PATH-TO-YOLOv8-series-TensorRT10]/yolov8/gen_wts.py .
+python gen_wts.py -w yolov8n-cls.pt -o yolov8n-cls.wts -t cls
+python gen_wts.py -w yolov8n.pt -o yolov8n.wts
+python gen_wts.py -w yolov8n-seg.pt -o yolov8n-seg.wts -t seg
+python gen_wts.py -w yolov8n-pose.pt -o yolov8n-pose.wts -t pose
+# The files 'yolov8n-cls.wts', 'yolov8n.wts', 'yolov8n-seg.wts' and 'yolov8n-pose.wts' will be generated.
+```
+
+2. Build YOLOv8-series-TensorRT10 and run
+
+#### Classification
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kClsNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-cls.wts .
+cmake ..
+make
+
+# Download ImageNet labels
+wget https://github.com/joannzhang00/ImageNet-dataset-classes-labels/blob/main/imagenet_classes.txt
+
+# Build and serialize TensorRT engine
+./yolov8_cls -s yolov8n-cls.wts yolov8n-cls.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_cls -d yolov8n-cls.engine ../images
+# The results are displayed in the console
+```
+
+3. Optional: load and run the TensorRT model in Python
+```shell
+# Install python-tensorrt, pycuda, etc.
+# Make sure yolov8n-cls.engine exists
+python yolov8_cls_trt.py ./build/yolov8n-cls.engine ../images
+# FAQ: on Windows you may hit pycuda._driver.LogicError
+# FAQ: on Linux you may hit a Segmentation fault
+# In that case, add the following code to the py file:
+# import pycuda.autoinit
+# import pycuda.driver as cuda
+```
+
+#### Detection
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_det -s yolov8n.wts yolov8n.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_det -d yolov8n.engine ../images [c/g]
+# The results are displayed in the console
+```
+
+#### Segmentation
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-seg.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_seg -s yolov8n-seg.wts yolov8n-seg.engine [n/s/m/l/x]
+
+# Download the labels file
+wget -O coco.txt https://raw.githubusercontent.com/amikelive/coco-labels/master/coco-labels-2014_2017.txt
+
+# Run inference
+./yolov8_seg -d yolov8n-seg.engine ../images [c/g] coco.txt
+# The results are displayed in the console
+```
+
+#### Pose
+
+```shell
+cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10
+# Update kPoseNumClass in src/config.h if your model is trained on a custom dataset
+mkdir build
+cd build
+cp [PATH-TO-ultralytics-yolov8]/yolov8n-pose.wts .
+cmake ..
+make
+
+# Build and serialize TensorRT engine
+./yolov8_pose -s yolov8n-pose.wts yolov8n-pose.engine [n/s/m/l/x]
+
+# Run inference
+./yolov8_pose -d yolov8n-pose.engine ../images c
+# The results are displayed in the console
+```
+
+## INT8 Quantization
+1. Prepare calibration images; you can randomly select about 1000 images from your training set. For COCO, you can also download my calibration images `coco_calib` from [GoogleDrive](https://drive.google.com/drive/folders/1s7jE9DtOngZMzJC1uL307J2MiaGwdRSI?usp=sharing) or [BaiduPan](https://pan.baidu.com/s/1GOm_-JobpyLMAqZWCDUhKg) pwd: a9wh
+2. Unzip it in yolov8_trt10/build
+3. Set the macro `USE_INT8` in src/config.h and make again
+4. Serialize the model and test (see the sketch below)
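As a concrete sketch of steps 2-4 for the detection model (the archive name `coco_calib.zip` and the `n` sub-model are assumptions, not fixed by this repo; `./coco_calib` matches `kInputQuantizationFolder` in src/config.h):

```shell
# Illustrative INT8 run for yolov8_det; adjust names and paths to your setup.
cd [PATH-TO-YOLOv8-series-TensorRT10]/YOLOv8-series-TensorRT10/build
unzip coco_calib.zip            # calibration images end up in ./coco_calib
# enable USE_INT8 (and disable USE_FP16/USE_FP32) in src/config.h, then rebuild
make
./yolov8_det -s yolov8n.wts yolov8n.engine n   # serialize the INT8-calibrated engine
./yolov8_det -d yolov8n.engine ../images g     # run inference on the sample images
```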
diff --git a/gen_wts.py b/gen_wts.py
new file mode 100644
index 0000000..5f037db
--- /dev/null
+++ b/gen_wts.py
@@ -0,0 +1,57 @@
+import sys  # noqa: F401
+import argparse
+import os
+import struct
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Convert .pt file to .wts')
+    parser.add_argument('-w', '--weights', required=True,
+                        help='Input weights (.pt) file path (required)')
+    parser.add_argument(
+        '-o', '--output', help='Output (.wts) file path (optional)')
+    parser.add_argument(
+        '-t', '--type', type=str, default='detect', choices=['detect', 'cls', 'seg', 'pose'],
+        help='determines the model type: detect/cls/seg/pose')
+    args = parser.parse_args()
+    if not os.path.isfile(args.weights):
+        raise SystemExit('Invalid input file')
+    if not args.output:
+        args.output = os.path.splitext(args.weights)[0] + '.wts'
+    elif os.path.isdir(args.output):
+        args.output = os.path.join(
+            args.output,
+            os.path.splitext(os.path.basename(args.weights))[0] + '.wts')
+    return args.weights, args.output, args.type
+
+
+pt_file, wts_file, m_type = parse_args()
+
+print(f'Generating .wts for {m_type} model')
+
+# Load model
+print(f'Loading {pt_file}')
+
+# Initialize
+device = 'cpu'
+
+# Load model
+model = torch.load(pt_file, map_location=device)['model'].float()  # load to FP32
+
+if m_type in ['detect', 'seg', 'pose']:
+    anchor_grid = model.model[-1].anchors * model.model[-1].stride[..., None, None]
+
+    delattr(model.model[-1], 'anchors')
+
+model.to(device).eval()
+
+with open(wts_file, 'w') as f:
+    f.write('{}\n'.format(len(model.state_dict().keys())))
+    for k, v in model.state_dict().items():
+        vr = v.reshape(-1).cpu().numpy()
+        f.write('{} {} '.format(k, len(vr)))
+        for vv in vr:
+            f.write(' ')
+            f.write(struct.pack('>f', float(vv)).hex())
+        f.write('\n')
diff --git a/images/bus.jpg b/images/bus.jpg
new file mode 100644
index 0000000..40eaaf5
Binary files /dev/null and b/images/bus.jpg differ
diff --git a/images/cat.jpg b/images/cat.jpg
new file mode 100644
index 0000000..df4a907
Binary files /dev/null and b/images/cat.jpg differ
diff --git a/images/dog.jpg b/images/dog.jpg
new file mode 100644
index 0000000..e76c295
Binary files /dev/null and b/images/dog.jpg differ
diff --git a/images/zidane.jpg b/images/zidane.jpg
new file mode 100644
index 0000000..eeab1cd
Binary files /dev/null and b/images/zidane.jpg differ
diff --git a/include/block.h b/include/block.h
new file mode 100644
index 0000000..1816e01
--- /dev/null
+++ b/include/block.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include "NvInfer.h"
+
+std::map<std::string, nvinfer1::Weights> loadWeights(const std::string file);
+
+nvinfer1::IScaleLayer *addBatchNorm2d(nvinfer1::INetworkDefinition *network,
+                                      std::map<std::string, nvinfer1::Weights> weightMap,
+                                      nvinfer1::ITensor &input, std::string lname, float eps);
+
+nvinfer1::IElementWiseLayer *convBnSiLU(nvinfer1::INetworkDefinition *network,
+                                        std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input,
+                                        int ch, int k, int s, int p, std::string lname);
+
+nvinfer1::IElementWiseLayer *C2F(nvinfer1::INetworkDefinition *network,
+                                 std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input, int c1,
+                                 int c2, int n, bool shortcut, float e, std::string lname);
+
+nvinfer1::IElementWiseLayer *C2(nvinfer1::INetworkDefinition *network,
+                                std::map<std::string, nvinfer1::Weights> &weightMap, nvinfer1::ITensor &input, int c1,
+                                int c2, int n, bool shortcut, float e, std::string lname);
+
+nvinfer1::IElementWiseLayer *SPPF(nvinfer1::INetworkDefinition *network,
+                                  std::map<std::string, nvinfer1::Weights> weightMap, nvinfer1::ITensor &input, int c1,
+                                  int c2, int k, std::string lname);
+
+nvinfer1::IShuffleLayer *DFL(nvinfer1::INetworkDefinition *network, std::map<std::string, nvinfer1::Weights> weightMap,
+                             nvinfer1::ITensor &input, int ch, int grid, int k, int s, int p, std::string lname);
+
+nvinfer1::IPluginV2Layer *addYoLoLayer(nvinfer1::INetworkDefinition *network,
+                                       std::vector<nvinfer1::IConcatenationLayer *> dets, const int *px_arry,
+                                       int px_arry_num, bool is_segmentation, bool is_pose);
diff --git a/include/calibrator.h b/include/calibrator.h
new file mode 100644
index 0000000..9bb60a7
--- /dev/null
+++ b/include/calibrator.h
@@ -0,0 +1,39 @@
+#ifndef ENTROPY_CALIBRATOR_H
+#define ENTROPY_CALIBRATOR_H
+
+#include <NvInfer.h>
+#include <string>
+#include <vector>
+#include "macros.h"
+
+//! \class Int8EntropyCalibrator2
+//!
+//! \brief Implements Entropy calibrator 2.
+//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
+//!
+class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
+{
+public:
+    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);
+    virtual ~Int8EntropyCalibrator2();
+    int getBatchSize() const TRT_NOEXCEPT override;
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT override;
+    const void* readCalibrationCache(size_t& length) TRT_NOEXCEPT override;
+    void writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT override;
+
+private:
+    int batchsize_;
+    int input_w_;
+    int input_h_;
+    int img_idx_;
+    std::string img_dir_;
+    std::vector<std::string> img_files_;
+    size_t input_count_;
+    std::string calib_table_name_;
+    const char* input_blob_name_;
+    bool read_cache_;
+    void* device_input_;
+    std::vector<char> calib_cache_;
+};
+
+#endif  // ENTROPY_CALIBRATOR_H
diff --git a/include/config.h b/include/config.h
new file mode 100644
index 0000000..e9c70c1
--- /dev/null
+++ b/include/config.h
@@ -0,0 +1,27 @@
+// #define USE_FP16
+// #define USE_FP32
+#define USE_INT8
+
+const static char* kInputTensorName = "images";
+const static char* kOutputTensorName = "output";
+const static char *kProtoTensorName = "proto";
+const static int kNumClass = 80;
+const static int kPoseNumClass = 1;
+const static int kNumberOfPoints = 17;  // number of keypoints total
+const static int kBatchSize = 1;
+const static int kGpuId = 0;
+const static int kInputH = 640;
+const static int kInputW = 640;
+const static float kNmsThresh = 0.45f;
+const static float kConfThresh = 0.5f;
+const static float kConfThreshKeypoints = 0.5f;  // keypoints confidence
+const static int kMaxInputImageSize = 3000 * 3000;
+const static int kMaxNumOutputBbox = 1000;
+// Quantization input image folder path
+const static char* kInputQuantizationFolder = "./coco_calib";
+
+// Classification model's number of classes
+constexpr static int kClsNumClass = 1000;
+// Classification model's input shape
+constexpr static int kClsInputH = 224;
+constexpr
static int kClsInputW = 224; diff --git a/include/cuda_utils.h b/include/cuda_utils.h new file mode 100644 index 0000000..8fbd319 --- /dev/null +++ b/include/cuda_utils.h @@ -0,0 +1,18 @@ +#ifndef TRTX_CUDA_UTILS_H_ +#define TRTX_CUDA_UTILS_H_ + +#include + +#ifndef CUDA_CHECK +#define CUDA_CHECK(callstr)\ + {\ + cudaError_t error_code = callstr;\ + if (error_code != cudaSuccess) {\ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__;\ + assert(0);\ + }\ + } +#endif // CUDA_CHECK + +#endif // TRTX_CUDA_UTILS_H_ + diff --git a/include/logging.h b/include/logging.h new file mode 100644 index 0000000..6b79a8b --- /dev/null +++ b/include/logging.h @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORRT_LOGGING_H +#define TENSORRT_LOGGING_H + +#include "NvInferRuntimeCommon.h" +#include +#include +#include +#include +#include +#include +#include +#include "macros.h" + +using Severity = nvinfer1::ILogger::Severity; + +class LogStreamConsumerBuffer : public std::stringbuf +{ +public: + LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mOutput(stream) + , mPrefix(prefix) + , mShouldLog(shouldLog) + { + } + + LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) + : mOutput(other.mOutput) + { + } + + ~LogStreamConsumerBuffer() + { + // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence + // std::streambuf::pptr() gives a pointer to the current position of the output sequence + // if the pointer to the beginning is not equal to the pointer to the current position, + // call putOutput() to log the output to the stream + if (pbase() != pptr()) + { + putOutput(); + } + } + + // synchronizes the stream buffer and returns 0 on success + // synchronizing the stream buffer consists of inserting the buffer contents into the stream, + // resetting the buffer and flushing the stream + virtual int sync() + { + putOutput(); + return 0; + } + + void putOutput() + { + if (mShouldLog) + { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(×tamp); + std::cout << "["; + std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; + // std::stringbuf::str() gets the string contents of the buffer + // insert the buffer contents pre-appended by the appropriate prefix into the stream + mOutput << mPrefix << str(); + // set the buffer to empty + str(""); + // flush the stream + mOutput.flush(); + } + } + + void setShouldLog(bool 
shouldLog) + { + mShouldLog = shouldLog; + } + +private: + std::ostream& mOutput; + std::string mPrefix; + bool mShouldLog; +}; + +//! +//! \class LogStreamConsumerBase +//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer +//! +class LogStreamConsumerBase +{ +public: + LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mBuffer(stream, prefix, shouldLog) + { + } + +protected: + LogStreamConsumerBuffer mBuffer; +}; + +//! +//! \class LogStreamConsumer +//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. +//! Order of base classes is LogStreamConsumerBase and then std::ostream. +//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field +//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. +//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. +//! Please do not change the order of the parent classes. +//! +class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream +{ +public: + //! \brief Creates a LogStreamConsumer which logs messages with level severity. + //! Reportable severity determines if the messages are severe enough to be logged. + LogStreamConsumer(Severity reportableSeverity, Severity severity) + : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(severity <= reportableSeverity) + , mSeverity(severity) + { + } + + LogStreamConsumer(LogStreamConsumer&& other) + : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(other.mShouldLog) + , mSeverity(other.mSeverity) + { + } + + void setReportableSeverity(Severity reportableSeverity) + { + mShouldLog = mSeverity <= reportableSeverity; + mBuffer.setShouldLog(mShouldLog); + } + +private: + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + static std::string severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + bool mShouldLog; + Severity mSeverity; +}; + +//! \class Logger +//! +//! \brief Class which manages logging of TensorRT tools and samples +//! +//! \details This class provides a common interface for TensorRT tools and samples to log information to the console, +//! and supports logging two types of messages: +//! +//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) +//! - Test pass/fail messages +//! +//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is +//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. +//! +//! In the future, this class could be extended to support dumping test results to a file in some standard format +//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). +//! +//! 
TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. + +class Logger : public nvinfer1::ILogger +{ +public: + Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! + nvinfer1::ILogger& getTRTLogger() + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) TRT_NOEXCEPT override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! 
\param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) + { + auto cmdline = genCmdlineString(argc, argv); + return defineTest(name, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! + static void reportTestEnd(const TestAtom& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int reportPass(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int reportFail(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int reportWaive(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int reportTest(const TestAtom& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(const TestAtom& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! 
+ static std::string genCmdlineString(int argc, char const* const* argv) + { + std::stringstream ss; + for (int i = 0; i < argc; i++) + { + if (i > 0) + ss << " "; + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; + +namespace +{ + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +// ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! 
+inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} + +} // anonymous namespace + +#endif // TENSORRT_LOGGING_H diff --git a/include/macros.h b/include/macros.h new file mode 100644 index 0000000..b187c94 --- /dev/null +++ b/include/macros.h @@ -0,0 +1,29 @@ +#ifndef __MACROS_H +#define __MACROS_H + +#include "NvInfer.h" + +#ifdef API_EXPORTS +#if defined(_MSC_VER) +#define API __declspec(dllexport) +#else +#define API __attribute__((visibility("default"))) +#endif +#else + +#if defined(_MSC_VER) +#define API __declspec(dllimport) +#else +#define API +#endif +#endif // API_EXPORTS + +#if NV_TENSORRT_MAJOR >= 8 +#define TRT_NOEXCEPT noexcept +#define TRT_CONST_ENQUEUE const +#else +#define TRT_NOEXCEPT +#define TRT_CONST_ENQUEUE +#endif + +#endif // __MACROS_H diff --git a/include/model.h b/include/model.h new file mode 100644 index 0000000..ad865df --- /dev/null +++ b/include/model.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include "NvInfer.h" + +nvinfer1::IHostMemory *buildEngineYolov8Cls(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw); + +nvinfer1::IHostMemory *buildEngineYolov8Det(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8DetP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8DetP2(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8Seg(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8Pose(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); + +nvinfer1::IHostMemory *buildEngineYolov8PoseP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels); \ No newline at end of file diff --git a/include/postprocess.h b/include/postprocess.h new file mode 100644 index 0000000..eb18d54 --- /dev/null +++ b/include/postprocess.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include "NvInfer.h" +#include "types.h" + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh = 0.5); + +void batch_nms(std::vector>& batch_res, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh = 0.5); + +void draw_bbox(std::vector& img_batch, std::vector>& res_batch); + +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch); + +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch); + +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count); + +void cuda_decode(float* predict, int num_bboxes, float confidence_threshold, 
float* parray, int max_objects, + cudaStream_t stream); + +void cuda_nms(float* parray, float nms_threshold, int max_objects, cudaStream_t stream); + +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map); diff --git a/include/preprocess.h b/include/preprocess.h new file mode 100644 index 0000000..10bead9 --- /dev/null +++ b/include/preprocess.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include "NvInfer.h" +#include "types.h" +#include + + +void cuda_preprocess_init(int max_image_size); + +void cuda_preprocess_destroy(); + +void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, cudaStream_t stream); + +void cuda_batch_preprocess(std::vector &img_batch, float *dst, int dst_width, int dst_height, cudaStream_t stream); + diff --git a/include/types.h b/include/types.h new file mode 100644 index 0000000..472c735 --- /dev/null +++ b/include/types.h @@ -0,0 +1,18 @@ +#pragma once +#include "config.h" + +struct alignas(float) Detection { + //center_x center_y w h + float bbox[4]; + float conf; // bbox_conf * cls_conf + float class_id; + float mask[32]; + float keypoints[51]; // 17*3 keypoints +}; + +struct AffineMatrix { + float value[6]; +}; + +const int bbox_element = + sizeof(AffineMatrix) / sizeof(float) + 1; // left, top, right, bottom, confidence, class, keepflag diff --git a/include/utils.h b/include/utils.h new file mode 100644 index 0000000..610c8e2 --- /dev/null +++ b/include/utils.h @@ -0,0 +1,86 @@ +#pragma once +#include +#include +#include + +static inline cv::Mat preprocess_img(cv::Mat& img, int input_w, int input_h) { + int w, h, x, y; + float r_w = input_w / (img.cols*1.0); + float r_h = input_h / (img.rows*1.0); + if (r_h > r_w) { + w = input_w; + h = r_w * img.rows; + x = 0; + y = (input_h - h) / 2; + } else { + w = r_h * img.cols; + h = input_h; + x = (input_w - w) / 2; + y = 0; + } + cv::Mat re(h, w, CV_8UC3); + cv::resize(img, re, re.size(), 0, 0, cv::INTER_LINEAR); + cv::Mat out(input_h, input_w, CV_8UC3, cv::Scalar(128, 128, 128)); + re.copyTo(out(cv::Rect(x, y, re.cols, re.rows))); + return out; +} + +static inline int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { + DIR *p_dir = opendir(p_dir_name); + if (p_dir == nullptr) { + return -1; + } + + struct dirent* p_file = nullptr; + while ((p_file = readdir(p_dir)) != nullptr) { + if (strcmp(p_file->d_name, ".") != 0 && + strcmp(p_file->d_name, "..") != 0) { + //std::string cur_file_name(p_dir_name); + //cur_file_name += "/"; + //cur_file_name += p_file->d_name; + std::string cur_file_name(p_file->d_name); + file_names.push_back(cur_file_name); + } + } + + closedir(p_dir); + return 0; +} + +// Function to trim leading and trailing whitespace from a string +static inline std::string trim_leading_whitespace(const std::string& str) { + size_t first = str.find_first_not_of(' '); + if (std::string::npos == first) { + return str; + } + size_t last = str.find_last_not_of(' '); + return str.substr(first, (last - first + 1)); +} + +// Src: https://stackoverflow.com/questions/16605967 +static inline std::string to_string_with_precision(const float a_value, const int n = 2) { + std::ostringstream out; + out.precision(n); + out << std::fixed << a_value; + return out.str(); +} + +static inline int read_labels(const std::string labels_filename, std::unordered_map& labels_map) { + std::ifstream file(labels_filename); + // Read each line of the file + std::string line; + int index = 0; + while 
(std::getline(file, line)) { + // Strip the line of any leading or trailing whitespace + line = trim_leading_whitespace(line); + + // Add the stripped line to the labels_map, using the loop index as the key + labels_map[index] = line; + index++; + } + // Close the file + file.close(); + + return 0; +} + diff --git a/plugin/yololayer.cu b/plugin/yololayer.cu new file mode 100644 index 0000000..c42b841 --- /dev/null +++ b/plugin/yololayer.cu @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include "cuda_utils.h" +#include "types.h" +#include "yololayer.h" + +namespace Tn { +template +void write(char*& buffer, const T& val) { + *reinterpret_cast(buffer) = val; + buffer += sizeof(T); +} + +template +void read(const char*& buffer, T& val) { + val = *reinterpret_cast(buffer); + buffer += sizeof(T); +} +} // namespace Tn + +__device__ float sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + +namespace nvinfer1 { +YoloLayerPlugin::YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, + int netHeight, int maxOut, bool is_segmentation, bool is_pose, const int* strides, + int stridesLength) { + + mClassCount = classCount; + mNumberofpoints = numberofpoints; + mConfthreshkeypoints = confthreshkeypoints; + mYoloV8NetWidth = netWidth; + mYoloV8netHeight = netHeight; + mMaxOutObject = maxOut; + mStridesLength = stridesLength; + mStrides = new int[stridesLength]; + memcpy(mStrides, strides, stridesLength * sizeof(int)); + is_segmentation_ = is_segmentation; + is_pose_ = is_pose; +} + +YoloLayerPlugin::~YoloLayerPlugin() { + if (mStrides != nullptr) { + delete[] mStrides; + mStrides = nullptr; + } +} + +YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) { + using namespace Tn; + const char *d = reinterpret_cast(data), *a = d; + read(d, mClassCount); + read(d, mNumberofpoints); + read(d, mConfthreshkeypoints); + read(d, mThreadCount); + read(d, mYoloV8NetWidth); + read(d, mYoloV8netHeight); + read(d, mMaxOutObject); + read(d, mStridesLength); + mStrides = new int[mStridesLength]; + for (int i = 0; i < mStridesLength; ++i) { + read(d, mStrides[i]); + } + read(d, is_segmentation_); + read(d, is_pose_); + + assert(d == a + length); +} + +void YoloLayerPlugin::serialize(void* buffer) const TRT_NOEXCEPT { + + using namespace Tn; + char *d = static_cast(buffer), *a = d; + write(d, mClassCount); + write(d, mNumberofpoints); + write(d, mConfthreshkeypoints); + write(d, mThreadCount); + write(d, mYoloV8NetWidth); + write(d, mYoloV8netHeight); + write(d, mMaxOutObject); + write(d, mStridesLength); + for (int i = 0; i < mStridesLength; ++i) { + write(d, mStrides[i]); + } + write(d, is_segmentation_); + write(d, is_pose_); + + assert(d == a + getSerializationSize()); +} + +size_t YoloLayerPlugin::getSerializationSize() const TRT_NOEXCEPT { + return sizeof(mClassCount) + sizeof(mNumberofpoints) + sizeof(mConfthreshkeypoints) + sizeof(mThreadCount) + + sizeof(mYoloV8netHeight) + sizeof(mYoloV8NetWidth) + sizeof(mMaxOutObject) + sizeof(mStridesLength) + + sizeof(int) * mStridesLength + sizeof(is_segmentation_) + sizeof(is_pose_); +} + +int YoloLayerPlugin::initialize() TRT_NOEXCEPT { + return 0; +} + +nvinfer1::Dims YoloLayerPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT { + int total_size = mMaxOutObject * sizeof(Detection) / sizeof(float); + return nvinfer1::Dims3(total_size + 1, 1, 1); +} + +void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT { + mPluginNamespace = 
pluginNamespace; +} + +const char* YoloLayerPlugin::getPluginNamespace() const TRT_NOEXCEPT { + return mPluginNamespace; +} + +nvinfer1::DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { + return nvinfer1::DataType::kFLOAT; +} + +bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT { + + return false; +} + +bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT { + + return false; +} + +void YoloLayerPlugin::configurePlugin(nvinfer1::PluginTensorDesc const* in, int nbInput, + nvinfer1::PluginTensorDesc const* out, int nbOutput) TRT_NOEXCEPT{}; + +void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT{}; + +void YoloLayerPlugin::detachFromContext() TRT_NOEXCEPT {} + +const char* YoloLayerPlugin::getPluginType() const TRT_NOEXCEPT { + + return "YoloLayer_TRT"; +} + +const char* YoloLayerPlugin::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} + +void YoloLayerPlugin::destroy() TRT_NOEXCEPT { + delete this; +} + +nvinfer1::IPluginV2IOExt* YoloLayerPlugin::clone() const TRT_NOEXCEPT { + + YoloLayerPlugin* p = + new YoloLayerPlugin(mClassCount, mNumberofpoints, mConfthreshkeypoints, mYoloV8NetWidth, mYoloV8netHeight, + mMaxOutObject, is_segmentation_, is_pose_, mStrides, mStridesLength); + p->setPluginNamespace(mPluginNamespace); + return p; +} + +int YoloLayerPlugin::enqueue(int batchSize, const void* TRT_CONST_ENQUEUE* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { + + forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, mYoloV8netHeight, mYoloV8NetWidth, batchSize); + return 0; +} + +__device__ float Logist(float data) { + return 1.0f / (1.0f + expf(-data)); +}; + +__global__ void CalDetection(const float* input, float* output, int numElements, int maxoutobject, const int grid_h, + int grid_w, const int stride, int classes, int nk, float confkeypoints, int outputElem, + bool is_segmentation, bool is_pose) { + int idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx >= numElements) + return; + + const int N_kpts = nk; + int total_grid = grid_h * grid_w; + int info_len = 4 + classes + (is_segmentation ? 32 : 0) + (is_pose ? 
N_kpts * 3 : 0); + int batchIdx = idx / total_grid; + int elemIdx = idx % total_grid; + const float* curInput = input + batchIdx * total_grid * info_len; + int outputIdx = batchIdx * outputElem; + + int class_id = 0; + float max_cls_prob = 0.0; + for (int i = 4; i < 4 + classes; i++) { + float p = Logist(curInput[elemIdx + i * total_grid]); + if (p > max_cls_prob) { + max_cls_prob = p; + class_id = i - 4; + } + } + + if (max_cls_prob < 0.1) + return; + + int count = (int)atomicAdd(output + outputIdx, 1); + if (count >= maxoutobject) + return; + char* data = (char*)(output + outputIdx) + sizeof(float) + count * sizeof(Detection); + Detection* det = (Detection*)(data); + + int row = elemIdx / grid_w; + int col = elemIdx % grid_w; + + det->conf = max_cls_prob; + det->class_id = class_id; + det->bbox[0] = (col + 0.5f - curInput[elemIdx + 0 * total_grid]) * stride; + det->bbox[1] = (row + 0.5f - curInput[elemIdx + 1 * total_grid]) * stride; + det->bbox[2] = (col + 0.5f + curInput[elemIdx + 2 * total_grid]) * stride; + det->bbox[3] = (row + 0.5f + curInput[elemIdx + 3 * total_grid]) * stride; + + if (is_segmentation) { + for (int k = 0; k < 32; ++k) { + det->mask[k] = curInput[elemIdx + (4 + classes + k) * total_grid]; + } + } + + if (is_pose) { + for (int kpt = 0; kpt < N_kpts; kpt++) { + int kpt_x_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3) * total_grid; + int kpt_y_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 1) * total_grid; + int kpt_conf_idx = (4 + classes + (is_segmentation ? 32 : 0) + kpt * 3 + 2) * total_grid; + + float kpt_confidence = sigmoid(curInput[elemIdx + kpt_conf_idx]); + + float kpt_x = (curInput[elemIdx + kpt_x_idx] * 2.0 + col) * stride; + float kpt_y = (curInput[elemIdx + kpt_y_idx] * 2.0 + row) * stride; + + bool is_within_bbox = + kpt_x >= det->bbox[0] && kpt_x <= det->bbox[2] && kpt_y >= det->bbox[1] && kpt_y <= det->bbox[3]; + + if (kpt_confidence < confkeypoints || !is_within_bbox) { + det->keypoints[kpt * 3] = -1; + det->keypoints[kpt * 3 + 1] = -1; + det->keypoints[kpt * 3 + 2] = -1; + } else { + det->keypoints[kpt * 3] = kpt_x; + det->keypoints[kpt * 3 + 1] = kpt_y; + det->keypoints[kpt * 3 + 2] = kpt_confidence; + } + } + } +} + +void YoloLayerPlugin::forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize) { + int outputElem = 1 + mMaxOutObject * sizeof(Detection) / sizeof(float); + cudaMemsetAsync(output, 0, sizeof(float), stream); + for (int idx = 0; idx < batchSize; ++idx) { + CUDA_CHECK(cudaMemsetAsync(output + idx * outputElem, 0, sizeof(float), stream)); + } + int numElem = 0; + + // const int maxGrids = mStridesLength; + // int grids[maxGrids][2]; + // for (int i = 0; i < maxGrids; ++i) { + // grids[i][0] = mYoloV8netHeight / mStrides[i]; + // grids[i][1] = mYoloV8NetWidth / mStrides[i]; + // } + + int maxGrids = mStridesLength; + int flatGridsLen = 2 * maxGrids; + int* flatGrids = new int[flatGridsLen]; + + for (int i = 0; i < maxGrids; ++i) { + flatGrids[2 * i] = mYoloV8netHeight / mStrides[i]; + flatGrids[2 * i + 1] = mYoloV8NetWidth / mStrides[i]; + } + + for (unsigned int i = 0; i < maxGrids; i++) { + // Access the elements of the original 2D array from the flattened 1D array + int grid_h = flatGrids[2 * i]; // Corresponds to the access of grids[i][0] + int grid_w = flatGrids[2 * i + 1]; // Corresponds to the access of grids[i][1] + int stride = mStrides[i]; + numElem = grid_h * grid_w * batchSize; // Calculate the total number of elements + if 
(numElem < mThreadCount) // Adjust the thread count if needed + mThreadCount = numElem; + + // The CUDA kernel call remains unchanged + CalDetection<<<(numElem + mThreadCount - 1) / mThreadCount, mThreadCount, 0, stream>>>( + inputs[i], output, numElem, mMaxOutObject, grid_h, grid_w, stride, mClassCount, mNumberofpoints, + mConfthreshkeypoints, outputElem, is_segmentation_, is_pose_); + } + + delete[] flatGrids; +} + +PluginFieldCollection YoloPluginCreator::mFC{}; +std::vector YoloPluginCreator::mPluginAttributes; + +YoloPluginCreator::YoloPluginCreator() { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); +} + +const char* YoloPluginCreator::getPluginName() const TRT_NOEXCEPT { + return "YoloLayer_TRT"; +} + +const char* YoloPluginCreator::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} + +const PluginFieldCollection* YoloPluginCreator::getFieldNames() TRT_NOEXCEPT { + return &mFC; +} + +IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { + assert(fc->nbFields == 1); + assert(strcmp(fc->fields[0].name, "combinedInfo") == 0); + const int* combinedInfo = static_cast(fc->fields[0].data); + int netinfo_count = 8; + int class_count = combinedInfo[0]; + int numberofpoints = combinedInfo[1]; + float confthreshkeypoints = combinedInfo[2]; + int input_w = combinedInfo[3]; + int input_h = combinedInfo[4]; + int max_output_object_count = combinedInfo[5]; + bool is_segmentation = combinedInfo[6]; + bool is_pose = combinedInfo[7]; + const int* px_arry = combinedInfo + netinfo_count; + int px_arry_length = fc->fields[0].length - netinfo_count; + YoloLayerPlugin* obj = + new YoloLayerPlugin(class_count, numberofpoints, confthreshkeypoints, input_w, input_h, + max_output_object_count, is_segmentation, is_pose, px_arry, px_arry_length); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} + +IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT { + // This object will be deleted when the network is destroyed, which will + // call YoloLayerPlugin::destroy() + YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; +} + +} // namespace nvinfer1 diff --git a/plugin/yololayer.h b/plugin/yololayer.h new file mode 100644 index 0000000..b516ad8 --- /dev/null +++ b/plugin/yololayer.h @@ -0,0 +1,109 @@ +#pragma once +#include +#include +#include "NvInfer.h" +#include "macros.h" +namespace nvinfer1 { +class API YoloLayerPlugin : public IPluginV2IOExt { + public: + YoloLayerPlugin(int classCount, int numberofpoints, float confthreshkeypoints, int netWidth, int netHeight, + int maxOut, bool is_segmentation, bool is_pose, const int* strides, int stridesLength); + + YoloLayerPlugin(const void* data, size_t length); + ~YoloLayerPlugin(); + + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; + + int initialize() TRT_NOEXCEPT override; + + virtual void terminate() TRT_NOEXCEPT override {} + + virtual size_t getWorkspaceSize(int maxBatchSize) const TRT_NOEXCEPT override { return 0; } + + virtual int enqueue(int batchSize, const void* const* inputs, void* TRT_CONST_ENQUEUE* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + virtual size_t getSerializationSize() const 
TRT_NOEXCEPT override; + + virtual void serialize(void* buffer) const TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) const TRT_NOEXCEPT override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } + + const char* getPluginType() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override; + + IPluginV2IOExt* clone() const TRT_NOEXCEPT override; + + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override; + + const char* getPluginNamespace() const TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const TRT_NOEXCEPT; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, + int nbInputs) const TRT_NOEXCEPT override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + + void configurePlugin(PluginTensorDesc const* in, int32_t nbInput, PluginTensorDesc const* out, + int32_t nbOutput) TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + private: + void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int mYoloV8netHeight, + int mYoloV8NetWidth, int batchSize); + int mThreadCount = 256; + const char* mPluginNamespace; + int mClassCount; + int mNumberofpoints; + float mConfthreshkeypoints; + int mYoloV8NetWidth; + int mYoloV8netHeight; + int mMaxOutObject; + bool is_segmentation_; + bool is_pose_; + int* mStrides; + int mStridesLength; +}; + +class API YoloPluginCreator : public IPluginCreator { + public: + YoloPluginCreator(); + ~YoloPluginCreator() override = default; + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + + nvinfer1::IPluginV2IOExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + + void setPluginNamespace(const char* libNamespace) TRT_NOEXCEPT override { mNamespace = libNamespace; } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector mPluginAttributes; +}; +REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); +} // namespace nvinfer1 diff --git a/src/block.cpp b/src/block.cpp new file mode 100644 index 0000000..a92a03a --- /dev/null +++ b/src/block.cpp @@ -0,0 +1,273 @@ +#include "block.h" +#include +#include +#include +#include +#include "config.h" +#include "yololayer.h" +#include "model.h" + + +std::map loadWeights(const std::string file) { + std::cout << "Loading weights: " << file << std::endl; + std::map WeightMap; + + std::ifstream input(file); + assert(input.is_open() && "Unable to load weight file. 
please check if the .wts file path is right!!!!!!"); + + int32_t count; + input >> count; + assert(count > 0 && "Invalid weight map file."); + + while (count--) { + nvinfer1::Weights wt{nvinfer1::DataType::kFLOAT, nullptr, 0}; + uint32_t size; + + std::string name; + input >> name >> std::dec >> size; + wt.type = nvinfer1::DataType::kFLOAT; + + uint32_t *val = reinterpret_cast(malloc(sizeof(val) * size)); + for (uint32_t x = 0, y = size; x < y; x++) { + input >> std::hex >> val[x]; + } + wt.values = val; + wt.count = size; + WeightMap[name] = wt; + } + return WeightMap; +} + +nvinfer1::IScaleLayer *addBatchNorm2d(nvinfer1::INetworkDefinition *network, + std::map weightMap, + nvinfer1::ITensor &input, std::string lname, float eps) { + float *gamma = (float *) weightMap[lname + ".weight"].values; + float *beta = (float *) weightMap[lname + ".bias"].values; + float *mean = (float *) weightMap[lname + ".running_mean"].values; + float *var = (float *) weightMap[lname + ".running_var"].values; + int len = weightMap[lname + ".running_var"].count; + + float *scval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + scval[i] = gamma[i] / sqrt(var[i] + eps); + } + nvinfer1::Weights scale{nvinfer1::DataType::kFLOAT, scval, len}; + + float *shval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps); + } + nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, shval, len}; + + float *pval = reinterpret_cast(malloc(sizeof(float) * len)); + for (int i = 0; i < len; i++) { + pval[i] = 1.0; + } + nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, pval, len}; + weightMap[lname + ".scale"] = scale; + weightMap[lname + ".shift"] = shift; + weightMap[lname + ".power"] = power; + nvinfer1::IScaleLayer *output = network->addScale(input, nvinfer1::ScaleMode::kCHANNEL, shift, scale, power); + assert(output); + return output; +} + +nvinfer1::IElementWiseLayer *convBnSiLU(nvinfer1::INetworkDefinition *network, + std::map weightMap, nvinfer1::ITensor &input, + int ch, int k, int s, int p, std::string lname) { + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); + assert(conv); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + + nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); + + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + return ew; +} + +nvinfer1::ILayer *bottleneck(nvinfer1::INetworkDefinition *network, std::map weightMap, + nvinfer1::ITensor &input, int c1, int c2, bool shortcut, float e, std::string lname) { + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, c2, 3, 1, 1, lname + ".cv1"); + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *conv1->getOutput(0), c2, 3, 1, 1, lname + ".cv2"); + + if (shortcut && c1 == c2) { + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(input, *conv2->getOutput(0), nvinfer1::ElementWiseOperation::kSUM); + return ew; + } + return conv2; +} + +nvinfer1::IElementWiseLayer *C2F(nvinfer1::INetworkDefinition 
*network, + std::map weightMap, nvinfer1::ITensor &input, int c1, + int c2, int n, bool shortcut, float e, std::string lname) { + int c_ = (float) c2 * e; + + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, 2 * c_, 1, 1, 0, lname + ".cv1"); + nvinfer1::Dims d = conv1->getOutput(0)->getDimensions(); + + nvinfer1::ISliceLayer *split1 = + network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, 0, 0, 0}, + nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ISliceLayer *split2 = + network->addSlice(*conv1->getOutput(0), nvinfer1::Dims4{0, d.d[1] / 2, 0, 0}, + nvinfer1::Dims4{d.d[0], d.d[1] / 2, d.d[2], d.d[3]}, nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ITensor *inputTensor0[] = {split1->getOutput(0), split2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(inputTensor0, 2); + nvinfer1::ITensor *y1 = split2->getOutput(0); + for (int i = 0; i < n; i++) { + auto *b = bottleneck(network, weightMap, *y1, c_, c_, shortcut, 1.0, lname + ".m." + std::to_string(i)); + y1 = b->getOutput(0); + + nvinfer1::ITensor *inputTensors[] = {cat->getOutput(0), b->getOutput(0)}; + cat = network->addConcatenation(inputTensors, 2); + } + + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + + return conv2; +} + +nvinfer1::IElementWiseLayer *C2(nvinfer1::INetworkDefinition *network, + std::map &weightMap, nvinfer1::ITensor &input, int c1, + int c2, int n, bool shortcut, float e, std::string lname) { + assert(network != nullptr); + int hidden_channels = static_cast(c2 * e); + + // cv1 branch + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, input, 2 * hidden_channels, 1, 1, 0, lname + ".cv1"); + nvinfer1::ITensor *cv1_out = conv1->getOutput(0); + + // Split the output of cv1 into two tensors + nvinfer1::Dims dims = cv1_out->getDimensions(); + nvinfer1::ISliceLayer *split1 = + network->addSlice(*cv1_out, nvinfer1::Dims4{0, 0, 0, 0}, + nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, + nvinfer1::Dims4{1, 1, 1, 1}); + nvinfer1::ISliceLayer *split2 = + network->addSlice(*cv1_out, nvinfer1::Dims4{0, dims.d[1] / 2, 0, 0}, + nvinfer1::Dims4{dims.d[0], dims.d[1] / 2, dims.d[2], dims.d[3]}, + nvinfer1::Dims4{1, 1, 1, 1}); + + // Create y1 bottleneck sequence + nvinfer1::ITensor *y1 = split1->getOutput(0); + for (int i = 0; i < n; ++i) { + auto *bottleneck_layer = bottleneck(network, weightMap, *y1, hidden_channels, hidden_channels, shortcut, 1.0, + lname + ".m." 
+ std::to_string(i)); + y1 = bottleneck_layer->getOutput(0); // update 'y1' to be the output of the current bottleneck + } + + // Concatenate y1 with the second split of cv1 + nvinfer1::ITensor *concatInputs[2] = {y1, split2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(concatInputs, 2); + + // cv2 to produce the final output + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + + return conv2; +} + +nvinfer1::IElementWiseLayer *SPPF(nvinfer1::INetworkDefinition *network, + std::map weightMap, nvinfer1::ITensor &input, int c1, + int c2, int k, std::string lname) { + int c_ = c1 / 2; + nvinfer1::IElementWiseLayer *conv1 = convBnSiLU(network, weightMap, input, c_, 1, 1, 0, lname + ".cv1"); + nvinfer1::IPoolingLayer *pool1 = + network->addPoolingNd(*conv1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool1->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool1->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::IPoolingLayer *pool2 = + network->addPoolingNd(*pool1->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool2->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool2->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::IPoolingLayer *pool3 = + network->addPoolingNd(*pool2->getOutput(0), nvinfer1::PoolingType::kMAX, nvinfer1::DimsHW{k, k}); + pool3->setStrideNd(nvinfer1::DimsHW{1, 1}); + pool3->setPaddingNd(nvinfer1::DimsHW{k / 2, k / 2}); + nvinfer1::ITensor *inputTensors[] = {conv1->getOutput(0), pool1->getOutput(0), pool2->getOutput(0), + pool3->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat = network->addConcatenation(inputTensors, 4); + nvinfer1::IElementWiseLayer *conv2 = + convBnSiLU(network, weightMap, *cat->getOutput(0), c2, 1, 1, 0, lname + ".cv2"); + return conv2; +} + +nvinfer1::IShuffleLayer *DFL(nvinfer1::INetworkDefinition *network, std::map weightMap, + nvinfer1::ITensor &input, int ch, int grid, int k, int s, int p, std::string lname) { + + nvinfer1::IShuffleLayer *shuffle1 = network->addShuffle(input); + shuffle1->setReshapeDimensions(nvinfer1::Dims4{kBatchSize, 4, 16, grid}); + shuffle1->setSecondTranspose(nvinfer1::Permutation{0, 2, 1, 3}); + nvinfer1::ISoftMaxLayer *softmax = network->addSoftMax(*shuffle1->getOutput(0)); + softmax->setAxes(1 << 1); + + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(*softmax->getOutput(0), 1, nvinfer1::DimsHW{1, 1}, weightMap[lname], bias_empty); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + + nvinfer1::IShuffleLayer *shuffle2 = network->addShuffle(*conv->getOutput(0)); + shuffle2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, 4, grid}); + + return shuffle2; +} + +nvinfer1::IPluginV2Layer *addYoLoLayer(nvinfer1::INetworkDefinition *network, + std::vector dets, const int *px_arry, + int px_arry_num, bool is_segmentation, bool is_pose) { + auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); + const int netinfo_count = 8; // Assuming the first 5 elements are for netinfo as per existing code. + const int total_count = netinfo_count + px_arry_num; // Total number of elements for netinfo and px_arry combined. + + std::vector combinedInfo(total_count); + // Fill in the first 5 elements as per existing netinfo. + combinedInfo[0] = is_pose ? 
kPoseNumClass : kNumClass; + combinedInfo[1] = kNumberOfPoints; + combinedInfo[2] = kConfThreshKeypoints; + combinedInfo[3] = kInputW; + combinedInfo[4] = kInputH; + combinedInfo[5] = kMaxNumOutputBbox; + combinedInfo[6] = is_segmentation; + combinedInfo[7] = is_pose; + + // Copy the contents of px_arry into the combinedInfo vector after the initial 5 elements. + std::copy(px_arry, px_arry + px_arry_num, combinedInfo.begin() + netinfo_count); + + // Now let's create the PluginField object to hold this combined information. + nvinfer1::PluginField pluginField; + pluginField.name = "combinedInfo"; // This can be any name that the plugin will recognize + pluginField.data = combinedInfo.data(); + pluginField.type = nvinfer1::PluginFieldType::kINT32; + pluginField.length = combinedInfo.size(); + + // Create the PluginFieldCollection to hold the PluginField object. + nvinfer1::PluginFieldCollection pluginFieldCollection; + pluginFieldCollection.nbFields = 1; // We have just one field, but it's a combined array + pluginFieldCollection.fields = &pluginField; + + // Create the plugin object using the PluginFieldCollection. + nvinfer1::IPluginV2 *pluginObject = creator->createPlugin("yololayer", &pluginFieldCollection); + + // We assume that the plugin is to be added onto the network. + // Prepare input tensors for the YOLO Layer. + std::vector inputTensors; + for (auto det: dets) { + inputTensors.push_back(det->getOutput(0)); // Assuming each IConcatenationLayer has one output tensor. + } + + // Add the plugin to the network using the prepared input tensors. + nvinfer1::IPluginV2Layer *yoloLayer = network->addPluginV2(inputTensors.data(), inputTensors.size(), *pluginObject); + + return yoloLayer; // Return the added YOLO layer. +} diff --git a/src/calibrator.cpp b/src/calibrator.cpp new file mode 100644 index 0000000..6202788 --- /dev/null +++ b/src/calibrator.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include "calibrator.h" +#include "cuda_utils.h" +#include "utils.h" + +Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, + const char* input_blob_name, bool read_cache) + : batchsize_(batchsize) + , input_w_(input_w) + , input_h_(input_h) + , img_idx_(0) + , img_dir_(img_dir) + , calib_table_name_(calib_table_name) + , input_blob_name_(input_blob_name) + , read_cache_(read_cache) +{ + input_count_ = 3 * input_w * input_h * batchsize; + CUDA_CHECK(cudaMalloc(&device_input_, input_count_ * sizeof(float))); + read_files_in_dir(img_dir, img_files_); +} + +Int8EntropyCalibrator2::~Int8EntropyCalibrator2() +{ + CUDA_CHECK(cudaFree(device_input_)); +} + +int Int8EntropyCalibrator2::getBatchSize() const TRT_NOEXCEPT +{ + return batchsize_; +} + +bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) TRT_NOEXCEPT +{ + if (img_idx_ + batchsize_ > (int)img_files_.size()) { + return false; + } + + std::vector input_imgs_; + for (int i = img_idx_; i < img_idx_ + batchsize_; i++) { + std::cout << img_files_[i] << " " << i << std::endl; + cv::Mat temp = cv::imread(img_dir_ + "/" + img_files_[i]); + if (temp.empty()){ + std::cerr << "Fatal error: image cannot open!" 
<< std::endl; + return false; + } + cv::Mat pr_img = preprocess_img(temp, input_w_, input_h_); + input_imgs_.push_back(pr_img); + } + img_idx_ += batchsize_; + cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false); + CUDA_CHECK(cudaMemcpy(device_input_, blob.ptr(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice)); + assert(!strcmp(names[0], input_blob_name_)); + bindings[0] = device_input_; + return true; +} + +const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) TRT_NOEXCEPT +{ + std::cout << "reading calib cache: " << calib_table_name_ << std::endl; + calib_cache_.clear(); + std::ifstream input(calib_table_name_, std::ios::binary); + input >> std::noskipws; + if (read_cache_ && input.good()) + { + std::copy(std::istream_iterator(input), std::istream_iterator(), std::back_inserter(calib_cache_)); + } + length = calib_cache_.size(); + return length ? calib_cache_.data() : nullptr; +} + +void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) TRT_NOEXCEPT +{ + std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl; + std::ofstream output(calib_table_name_, std::ios::binary); + output.write(reinterpret_cast(cache), length); +} + diff --git a/src/model.cpp b/src/model.cpp new file mode 100644 index 0000000..af3301c --- /dev/null +++ b/src/model.cpp @@ -0,0 +1,2050 @@ +#include +#include + +#include "block.h" +#include "calibrator.h" +#include "config.h" +#include "model.h" + +static int get_width(int x, float gw, int max_channels, int divisor = 8) { + auto channel = int(ceil((x * gw) / divisor)) * divisor; + return channel >= max_channels ? max_channels : channel; +} + +static int get_depth(int x, float gd) { + if (x == 1) + return 1; + int r = round(x * gd); + if (x * gd - int(x * gd) == 0.5 && (int(x * gd) % 2) == 0) + --r; + return std::max(r, 1); +} + +void calculateStrides(nvinfer1::IElementWiseLayer *conv_layers[], int size, int reference_size, int strides[]) { + for (int i = 0; i < size; ++i) { + nvinfer1::ILayer *layer = conv_layers[i]; + nvinfer1::Dims dims = layer->getOutput(0)->getDimensions(); + int feature_map_size = dims.d[2]; + strides[i] = reference_size / feature_map_size; + } +} + +nvinfer1::IHostMemory *buildEngineYolov8Cls(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + int max_channels = 1280; + // ****************************************** YOLOV8 INPUT ********************************************** + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, + nvinfer1::Dims4{kBatchSize, 3, kClsInputH, kClsInputW}); + assert(data); + + // ***************************************** YOLOV8 BACKBONE ******************************************** + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // C2 Block (11233) + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, 
max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // C2 Block Sequence (22466) + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // C2 Block Sequence (22466) + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // C2 Block (11233) + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + // ********************************************* YOLOV8 HEAD ********************************************* + + auto conv_class = convBnSiLU(network, weightMap, *conv8->getOutput(0), 1280, 1, 1, 0, "model.9.conv"); + // Adjusted code + nvinfer1::Dims dims = + conv_class->getOutput(0)->getDimensions(); // Obtain the dimensions of the output of conv_class + assert(dims.nbDims == 4); // Make sure there are exactly 3 dimensions (channels, height, width) + + nvinfer1::IPoolingLayer *pool2 = network->addPoolingNd(*conv_class->getOutput(0), nvinfer1::PoolingType::kAVERAGE, + nvinfer1::DimsHW{dims.d[2], dims.d[3]}); + assert(pool2); + + // Fully connected layer declaration + auto shuffle_0 = network->addShuffle(*pool2->getOutput(0)); + shuffle_0->setReshapeDimensions(nvinfer1::Dims2{kBatchSize, max_channels}); + auto linear_weight = weightMap["model.9.linear.weight"]; + auto constant_weight = network->addConstant(nvinfer1::Dims2{kClsNumClass, max_channels}, linear_weight); + auto constant_bias = network->addConstant(nvinfer1::Dims2{kBatchSize, kClsNumClass}, + weightMap["model.9.linear.bias"]); + auto linear_matrix_multipy = network->addMatrixMultiply(*shuffle_0->getOutput(0), + nvinfer1::MatrixOperation::kNONE, + *constant_weight->getOutput(0), + nvinfer1::MatrixOperation::kTRANSPOSE); + auto yolo = network->addElementWise(*linear_matrix_multipy->getOutput(0), *constant_bias->getOutput(0), + nvinfer1::ElementWiseOperation::kSUM); + assert(yolo); + + // Set the name for the output tensor and mark it as network output + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + // Set the maximum batch size and workspace size + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + + // Configuration according to the precision mode being used +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform supports int8: " << (builder->platformHasFastInt8() ? 
"true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kClsInputW, kClsInputH, kInputQuantizationFolder, + "int8calib.table", kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + // Begin building the engine; this may take a while + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Cleanup the network definition and allocated weights + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory *buildEngineYolov8Det(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, 
max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = + convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = + convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], 
weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = + convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 2); + cat22_dfl_0->setAxis(1); + + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 2); + cat22_dfl_1->setAxis(1); 
+ + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 2); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, false); + + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8DetP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer *conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer *conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer *conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + 
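    // Stride sketch (illustration only): model.0/1/3/5/7/9 above are the stride-2 convs, so
    // the taps handed to calculateStrides() further down (conv3/conv5/conv7/conv9) sit at
    // strides 8/16/32/64 for any input whose side is a multiple of 64. Quick arithmetic
    // check of the stride = reference_size / feature_map_size rule, assuming a 640x640 input:
    static_assert(640 / (640 / 8) == 8 && 640 / (640 / 16) == 16 && 640 / (640 / 32) == 32 &&
                          640 / (640 / 64) == 64,
                  "P3..P6 strides expected to be 8/16/32/64");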
/******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample12->setScales(scale, 4); + nvinfer1::ITensor *concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer *conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample15->setScales(scale, 4); + nvinfer1::ITensor *concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer *conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer *upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample18->setScales(scale, 4); + nvinfer1::ITensor *concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer *conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer *conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor *concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer *conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer *conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor *concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer *conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer *conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor *concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + 
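    // P6 bottom-up step (sketch): conv27 is the stride-2 downsample of the P5 path and
    // conv11 is the SPPF output, so the two inputs gathered above share the same H x W and
    // differ only in channel count; addConcatenation() below joins them along the default
    // axis, which for these 4-D tensors is the channel axis. Illustration-only sanity check,
    // assuming static, explicit-batch dimensions:
    {
        nvinfer1::Dims a = conv27->getOutput(0)->getDimensions();
        nvinfer1::Dims b = conv11->getOutput(0)->getDimensions();
        assert(a.nbDims == 4 && b.nbDims == 4);
        assert(a.d[2] == b.d[2] && a.d[3] == b.d[3]);  // matching spatial size before concat
    }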
nvinfer1::IConcatenationLayer *concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer *conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_0_1 = + convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_1_1 = + convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 
0}); + nvinfer1::ITensor *inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_2_2 = + network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_2_2 = + network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_3_2 = + network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_3_2 = + network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_3 = network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, 
conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 2); + cat30_dfl_0->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 2); + cat30_dfl_1->setAxis(1); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / 
strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 2); + cat30_dfl_2->setAxis(1); + + // P6 processing steps + nvinfer1::IShuffleLayer *shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 2); + cat30_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8DetP2(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + // 11233 + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P4 + 
nvinfer1::IResizeLayer *upsample10 = network->addResize( + *conv9->getOutput(0)); // Assuming conv9 is the last layer of the backbone as per P5 in your first section. + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + nvinfer1::ITensor *concat11_inputs[] = { + upsample10->getOutput(0), + conv6->getOutput(0)}; // Assuming conv6 corresponds to "backbone P4" as per your pseudocode + nvinfer1::IConcatenationLayer *concat11 = network->addConcatenation(concat11_inputs, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *concat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + // P3 + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + nvinfer1::ITensor *concat14_inputs[] = {upsample13->getOutput(0), + conv4->getOutput(0)}; // Assuming conv4 corresponds to "backbone P3" + nvinfer1::IConcatenationLayer *concat14 = network->addConcatenation(concat14_inputs, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *concat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + + // P2 + nvinfer1::IResizeLayer *upsample16 = network->addResize(*conv15->getOutput(0)); + upsample16->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample16->setScales(scale, 4); + nvinfer1::ITensor *concat17_inputs[] = {upsample16->getOutput(0), + conv2->getOutput(0)}; // Assuming conv2 corresponds to "backbone P2" + nvinfer1::IConcatenationLayer *concat17 = network->addConcatenation(concat17_inputs, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *concat17->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + + // Additional layers for P3, P4, P5 + // Downsample and concatenate for P3 + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(128, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *concat20_inputs[] = { + conv19->getOutput(0), conv15->getOutput(0)}; // concatenate with higher-resolution feature map from P3 + nvinfer1::IConcatenationLayer *concat20 = network->addConcatenation(concat20_inputs, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *concat20->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + // Downsample and concatenate for P4 + nvinfer1::IElementWiseLayer *conv22 = convBnSiLU(network, weightMap, *conv21->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.22"); + nvinfer1::ITensor *concat23_inputs[] = { + conv22->getOutput(0), conv12->getOutput(0)}; // concatenate with higher-resolution feature map from P4 + nvinfer1::IConcatenationLayer *concat23 = network->addConcatenation(concat23_inputs, 2); + nvinfer1::IElementWiseLayer *conv24 = + C2F(network, weightMap, *concat23->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.24"); + + // Downsample and concatenate for P5 + nvinfer1::IElementWiseLayer *conv25 = convBnSiLU(network, weightMap, *conv24->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.25"); + nvinfer1::ITensor *concat26_inputs[] = { + 
conv25->getOutput(0), conv9->getOutput(0)}; // concatenate with higher-resolution feature map from P5 + nvinfer1::IConcatenationLayer *concat26 = network->addConcatenation(concat26_inputs, 2); + nvinfer1::IElementWiseLayer *conv27 = + C2F(network, weightMap, *concat26->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.27"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ +// int ch_0 = conv18->getOutput(0)->getDimensions().d[1]; +// int base_in_channel = std::max(16, std::max(ch_0 / 4, 64)); +// int base_out_channel = std::max(ch_0, std::min(kNumClass, 100)); + int base_in_channel = 64; + int base_out_channel = (gw == 0.25) ? std::max(32, std::min(kNumClass, 100)) : get_width(128, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv28_cv2_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_0_1 = + convBnSiLU(network, weightMap, *conv28_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_0_2 = + network->addConvolutionNd(*conv28_cv2_0_1->getOutput(0), base_in_channel, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.0.2.weight"], weightMap["model.28.cv2.0.2.bias"]); + conv28_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_0_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_0_1 = + convBnSiLU(network, weightMap, *conv28_cv3_0_0->getOutput(0), base_out_channel, 3, 1, 1, + "model.28.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_0_2 = + network->addConvolutionNd(*conv28_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.0.2.weight"], weightMap["model.28.cv3.0.2.bias"]); + conv28_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_0[] = {conv28_cv2_0_2->getOutput(0), conv28_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_0 = network->addConcatenation(inputTensor28_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv28_cv2_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_1_1 = + convBnSiLU(network, weightMap, *conv28_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_1_2 = + network->addConvolutionNd(*conv28_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.1.2.weight"], weightMap["model.28.cv2.1.2.bias"]); + conv28_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_1_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_1_1 = convBnSiLU(network, weightMap, *conv28_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.1.1"); + nvinfer1::IConvolutionLayer 
*conv28_cv3_1_2 = + network->addConvolutionNd(*conv28_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.1.2.weight"], weightMap["model.28.cv3.1.2.bias"]); + conv28_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_1[] = {conv28_cv2_1_2->getOutput(0), conv28_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_1 = network->addConcatenation(inputTensor28_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv28_cv2_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_2_1 = + convBnSiLU(network, weightMap, *conv28_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_2_2 = + network->addConvolutionNd(*conv28_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.2.2.weight"], weightMap["model.28.cv2.2.2.bias"]); + conv28_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_2_0 = + convBnSiLU(network, weightMap, *conv24->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_2_1 = convBnSiLU(network, weightMap, *conv28_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_2_2 = + network->addConvolutionNd(*conv28_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.2.2.weight"], weightMap["model.28.cv3.2.2.bias"]); + conv28_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_2[] = {conv28_cv2_2_2->getOutput(0), conv28_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_2 = network->addConcatenation(inputTensor28_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv28_cv2_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv28_cv2_3_1 = + convBnSiLU(network, weightMap, *conv28_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.28.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv28_cv2_3_2 = + network->addConvolutionNd(*conv28_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv2.3.2.weight"], weightMap["model.28.cv2.3.2.bias"]); + conv28_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv28_cv3_3_0 = + convBnSiLU(network, weightMap, *conv27->getOutput(0), base_out_channel, 3, 1, 1, "model.28.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv28_cv3_3_1 = + convBnSiLU(network, weightMap, *conv28_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.28.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv28_cv3_3_2 = + network->addConvolutionNd(*conv28_cv3_3_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.28.cv3.3.2.weight"], weightMap["model.28.cv3.3.2.bias"]); + conv28_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv28_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor28_3[] = {conv28_cv2_3_2->getOutput(0), conv28_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_3 = network->addConcatenation(inputTensor28_3, 2); + + 
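+    // NOTE: in the detect stage below each of the four raw head outputs (cat28_0..cat28_3) is reshaped
+    // to [kBatchSize, 64 + kNumClass, gridH * gridW] and split: the first 64 channels are the box
+    // branch (interpreted by DFL as 4 box sides x 16 distribution bins), the remaining kNumClass
+    // channels are the class scores. calculateStrides() presumably derives each scale's stride from the
+    // corresponding backbone layer's output height, which for this P2 variant gives {4, 8, 16, 32}.
+    // Illustration only, not part of the network (assuming a 640x640 input):
+    //     int cells = 0;
+    //     for (int s : {4, 8, 16, 32}) cells += (640 / s) * (640 / s);  // 25600 + 6400 + 1600 + 400 = 34000 anchor points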
/******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv1, conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P2 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_0 = network->addShuffle(*cat28_0->getOutput(0)); + shuffle28_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split28_0_0 = network->addSlice( + *shuffle28_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_0_1 = network->addSlice( + *shuffle28_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_0 = + DFL(network, weightMap, *split28_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_0[] = {dfl28_0->getOutput(0), split28_0_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_0 = network->addConcatenation(inputTensor28_dfl_0, 2); + cat28_dfl_0->setAxis(1); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_1 = network->addShuffle(*cat28_1->getOutput(0)); + shuffle28_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split28_1_0 = network->addSlice( + *shuffle28_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_1_1 = network->addSlice( + *shuffle28_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_1 = + DFL(network, weightMap, *split28_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_1[] = {dfl28_1->getOutput(0), split28_1_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_1 = network->addConcatenation(inputTensor28_dfl_1, 2); + cat28_dfl_1->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle28_2 = network->addShuffle(*cat28_2->getOutput(0)); + shuffle28_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split28_2_0 = network->addSlice( + *shuffle28_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_2_1 = network->addSlice( + *shuffle28_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / 
strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_2 = + DFL(network, weightMap, *split28_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_2[] = {dfl28_2->getOutput(0), split28_2_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_2 = network->addConcatenation(inputTensor28_dfl_2, 2); + cat28_dfl_2->setAxis(1); + + // P5 processing steps + nvinfer1::IShuffleLayer *shuffle28_3 = network->addShuffle(*cat28_3->getOutput(0)); + shuffle28_3->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split28_3_0 = network->addSlice( + *shuffle28_3->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split28_3_1 = network->addSlice( + *shuffle28_3->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl28_3 = + DFL(network, weightMap, *split28_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.28.dfl.conv.weight"); + nvinfer1::ITensor *inputTensor28_dfl_3[] = {dfl28_3->getOutput(0), split28_3_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat28_dfl_3 = network->addConcatenation(inputTensor28_dfl_3, 2); + cat28_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat28_dfl_0, cat28_dfl_1, cat28_dfl_2, cat28_dfl_3}, + strides, stridesLength, false, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +static nvinfer1::IElementWiseLayer *convBnSiLUProto(nvinfer1::INetworkDefinition *network, + std::map weightMap, + nvinfer1::ITensor &input, + int ch, int k, int s, int p, std::string lname) { + nvinfer1::Weights bias_empty{nvinfer1::DataType::kFLOAT, nullptr, 0}; + nvinfer1::IConvolutionLayer *conv = + network->addConvolutionNd(input, ch, nvinfer1::DimsHW{k, k}, weightMap[lname + ".conv.weight"], bias_empty); + assert(conv); + conv->setStrideNd(nvinfer1::DimsHW{s, s}); + conv->setPaddingNd(nvinfer1::DimsHW{p, p}); + conv->setName((lname + ".conv").c_str()); + + nvinfer1::IScaleLayer *bn = addBatchNorm2d(network, weightMap, *conv->getOutput(0), lname + ".bn", 1e-3); + bn->setName((lname + ".bn").c_str()); + // This concat operator is not used for calculation, in order to prevent the operator fusion unrealized error when int8 is quantized. + // Error Code 10: Internal Error (Could not find any implementation for node + // model.22.proto.cv3.conv + model.22.proto.cv3.sigmoid + PWN(PWN((Unnamed Layer* 353) [Activation]), PWN(model.22.proto.cv3.silu)).) + +#if defined(USE_INT8) + nvinfer1::ITensor *inputTensors[] = {bn->getOutput(0)}; + auto concat = network->addConcatenation(inputTensors, 1); + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*concat->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + assert(sigmoid); + bn->setName((lname + ".sigmoid").c_str()); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*concat->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + ew->setName((lname + ".silu").c_str()); +#else + nvinfer1::IActivationLayer *sigmoid = network->addActivation(*bn->getOutput(0), nvinfer1::ActivationType::kSIGMOID); + assert(sigmoid); + bn->setName((lname + ".sigmoid").c_str()); + nvinfer1::IElementWiseLayer *ew = + network->addElementWise(*bn->getOutput(0), *sigmoid->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); + assert(ew); + ew->setName((lname + ".silu").c_str()); +#endif + return ew; +} + +static nvinfer1::IElementWiseLayer *Proto(nvinfer1::INetworkDefinition *network, + std::map &weightMap, nvinfer1::ITensor &input, + std::string lname, float gw, int max_channels) { + int mid_channel = get_width(256, gw, max_channels); + auto cv1 = convBnSiLU(network, weightMap, input, mid_channel, 3, 1, 1, "model.22.proto.cv1"); + float *convTranpsose_bais = (float *) weightMap["model.22.proto.upsample.bias"].values; + int convTranpsose_bais_len = weightMap["model.22.proto.upsample.bias"].count; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, convTranpsose_bais, convTranpsose_bais_len}; + auto convTranpsose = network->addDeconvolutionNd(*cv1->getOutput(0), mid_channel, nvinfer1::DimsHW{2, 2}, + weightMap["model.22.proto.upsample.weight"], bias); + assert(convTranpsose); + convTranpsose->setStrideNd(nvinfer1::DimsHW{2, 2}); + auto cv2 = convBnSiLU(network, weightMap, *convTranpsose->getOutput(0), mid_channel, 3, 1, 1, + "model.22.proto.cv2"); + auto cv3 = convBnSiLUProto(network, weightMap, *cv2->getOutput(0), 32, 1, 1, 0, "model.22.proto.cv3"); + assert(cv3); + return cv3; +} + +static nvinfer1::IShuffleLayer *cv4_conv_combined(nvinfer1::INetworkDefinition *network, + std::map &weightMap, + nvinfer1::ITensor &input, std::string lname, int grid_shape, float gw, + std::string algo_type) { + int mid_channle = 0; + int output_channel = 0; + + if (algo_type == "seg") { + 
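+        // The width of the mask-coefficient branch follows the model scale: gw 0.25/0.5 (n/s) -> 32,
+        // 0.75 (m) -> 48, 1.00 (l) -> 64, 1.25 (x) -> 80. The branch always emits 32 coefficients per
+        // anchor point, matching the 32 prototype masks produced by Proto() above.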
if (gw == 0.25 || gw == 0.5) { + mid_channle = 32; + } else if (gw == 0.75) { + mid_channle = 48; + } else if (gw == 1.00) { + mid_channle = 64; + } else if (gw == 1.25) { + mid_channle = 80; + } + + output_channel = 32; + } else if (algo_type == "pose") { + std::string bn_weight_key = lname + ".0.bn.weight"; + mid_channle = weightMap[bn_weight_key].count; + output_channel = kNumberOfPoints * 3; + } + + auto cv0 = convBnSiLU(network, weightMap, input, mid_channle, 3, 1, 1, lname + ".0"); + auto cv1 = convBnSiLU(network, weightMap, *cv0->getOutput(0), mid_channle, 3, 1, 1, lname + ".1"); + float *cv2_bais_value = (float *) weightMap[lname + ".2" + ".bias"].values; + int cv2_bais_len = weightMap[lname + ".2" + ".bias"].count; + nvinfer1::Weights cv2_bais{nvinfer1::DataType::kFLOAT, cv2_bais_value, cv2_bais_len}; + auto cv2 = network->addConvolutionNd(*cv1->getOutput(0), output_channel, nvinfer1::DimsHW{1, 1}, + weightMap[lname + ".2" + ".weight"], cv2_bais); + cv2->setStrideNd(nvinfer1::DimsHW{1, 1}); + nvinfer1::IShuffleLayer *cv2_shuffle = network->addShuffle(*cv2->getOutput(0)); + cv2_shuffle->setReshapeDimensions(nvinfer1::Dims3{kBatchSize, output_channel, grid_shape}); + + return cv2_shuffle; +} + +nvinfer1::IHostMemory * +buildEngineYolov8Seg(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, 
max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) 
? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + 
weightMap["model.22.cv2.2.2.weight"], weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / 
strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det0 + auto proto_coef_0 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + proto_coef_0->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + cat22_dfl_0->setAxis(1); + + // det1 + auto proto_coef_1 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + proto_coef_1->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + cat22_dfl_1->setAxis(1); + + // det2 + auto proto_coef_2 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "seg"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + proto_coef_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, true, false); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + auto proto = Proto(network, weightMap, *conv15->getOutput(0), "model.22.proto", gw, max_channels); + proto->getOutput(0)->setName(kProtoTensorName); + network->markOutput(*proto->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto *calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory *buildEngineYolov8Pose(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, + nvinfer1::DataType dt, const std::string &wts_path, float &gd, float &gw, + int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = + C2F(network, weightMap, *conv7->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + nvinfer1::IElementWiseLayer *conv9 = + SPPF(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.9"); + /******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + float scale[] = {1.0, 1.0, 2.0, 2.0}; + nvinfer1::IResizeLayer *upsample10 = network->addResize(*conv9->getOutput(0)); + assert(upsample10); + 
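+    // scale[] holds per-axis resize factors for the NCHW tensor: batch and channels stay unchanged,
+    // H and W are doubled, i.e. a 2x nearest-neighbour upsample feeding the top-down (FPN) path.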
upsample10->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample10->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor11[] = {upsample10->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat11 = network->addConcatenation(inputTensor11, 2); + nvinfer1::IElementWiseLayer *conv12 = + C2F(network, weightMap, *cat11->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.12"); + + nvinfer1::IResizeLayer *upsample13 = network->addResize(*conv12->getOutput(0)); + assert(upsample13); + upsample13->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample13->setScales(scale, 4); + + nvinfer1::ITensor *inputTensor14[] = {upsample13->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat14 = network->addConcatenation(inputTensor14, 2); + nvinfer1::IElementWiseLayer *conv15 = + C2F(network, weightMap, *cat14->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.15"); + nvinfer1::IElementWiseLayer *conv16 = convBnSiLU(network, weightMap, *conv15->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.16"); + nvinfer1::ITensor *inputTensor17[] = {conv16->getOutput(0), conv12->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat17 = network->addConcatenation(inputTensor17, 2); + nvinfer1::IElementWiseLayer *conv18 = + C2F(network, weightMap, *cat17->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.18"); + nvinfer1::IElementWiseLayer *conv19 = convBnSiLU(network, weightMap, *conv18->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.19"); + nvinfer1::ITensor *inputTensor20[] = {conv19->getOutput(0), conv9->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat20 = network->addConcatenation(inputTensor20, 2); + nvinfer1::IElementWiseLayer *conv21 = + C2F(network, weightMap, *cat20->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.21"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? 
std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv22_cv2_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_0_1 = + convBnSiLU(network, weightMap, *conv22_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_0_2 = + network->addConvolutionNd(*conv22_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.0.2.weight"], weightMap["model.22.cv2.0.2.bias"]); + conv22_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_0_0 = + convBnSiLU(network, weightMap, *conv15->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.0.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_0_1 = convBnSiLU(network, weightMap, *conv22_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_0_2 = + network->addConvolutionNd(*conv22_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.0.2.weight"], weightMap["model.22.cv3.0.2.bias"]); + conv22_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_0[] = {conv22_cv2_0_2->getOutput(0), conv22_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_0 = network->addConcatenation(inputTensor22_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv22_cv2_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_1_1 = + convBnSiLU(network, weightMap, *conv22_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_1_2 = + network->addConvolutionNd(*conv22_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.1.2.weight"], weightMap["model.22.cv2.1.2.bias"]); + conv22_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv22_cv3_1_0 = + convBnSiLU(network, weightMap, *conv18->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_1_1 = convBnSiLU(network, weightMap, *conv22_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_1_2 = + network->addConvolutionNd(*conv22_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.1.2.weight"], weightMap["model.22.cv3.1.2.bias"]); + conv22_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv22_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor22_1[] = {conv22_cv2_1_2->getOutput(0), conv22_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_1 = network->addConcatenation(inputTensor22_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv22_cv2_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv2_2_1 = + convBnSiLU(network, weightMap, *conv22_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.22.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv2_2_2 = + network->addConvolutionNd(*conv22_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv2.2.2.weight"], 
weightMap["model.22.cv2.2.2.bias"]); + nvinfer1::IElementWiseLayer *conv22_cv3_2_0 = + convBnSiLU(network, weightMap, *conv21->getOutput(0), base_out_channel, 3, 1, 1, "model.22.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv22_cv3_2_1 = convBnSiLU(network, weightMap, *conv22_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.22.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv22_cv3_2_2 = + network->addConvolutionNd(*conv22_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.22.cv3.2.2.weight"], weightMap["model.22.cv3.2.2.bias"]); + nvinfer1::ITensor *inputTensor22_2[] = {conv22_cv2_2_2->getOutput(0), conv22_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_2 = network->addConcatenation(inputTensor22_2, 2); + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + + nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + /**************************************************************************************P3****************************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_0 = network->addShuffle(*cat22_0->getOutput(0)); + shuffle22_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split22_0_0 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_0_1 = network->addSlice( + *shuffle22_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_0 = + DFL(network, weightMap, *split22_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det0 + auto shuffle_conv15 = cv4_conv_combined(network, weightMap, *conv15->getOutput(0), "model.22.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + + nvinfer1::ITensor *inputTensor22_dfl_0[] = {dfl22_0->getOutput(0), split22_0_1->getOutput(0), + shuffle_conv15->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_0 = network->addConcatenation(inputTensor22_dfl_0, 3); + cat22_dfl_0->setAxis(1); + + /********************************************************************************************P4**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_1 = network->addShuffle(*cat22_1->getOutput(0)); + shuffle22_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split22_1_0 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / 
strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_1_1 = network->addSlice( + *shuffle22_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_1 = + DFL(network, weightMap, *split22_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det1 + auto shuffle_conv18 = cv4_conv_combined(network, weightMap, *conv18->getOutput(0), "model.22.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + + nvinfer1::ITensor *inputTensor22_dfl_1[] = {dfl22_1->getOutput(0), split22_1_1->getOutput(0), + shuffle_conv18->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_1 = network->addConcatenation(inputTensor22_dfl_1, 3); + cat22_dfl_1->setAxis(1); + + /********************************************************************************************P5**********************************************************************************************************************************/ + nvinfer1::IShuffleLayer *shuffle22_2 = network->addShuffle(*cat22_2->getOutput(0)); + shuffle22_2->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split22_2_0 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split22_2_1 = network->addSlice( + *shuffle22_2->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl22_2 = + DFL(network, weightMap, *split22_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.22.dfl.conv.weight"); + + // det2 + auto shuffle_conv21 = cv4_conv_combined(network, weightMap, *conv21->getOutput(0), "model.22.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor *inputTensor22_dfl_2[] = {dfl22_2->getOutput(0), split22_2_1->getOutput(0), + shuffle_conv21->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat22_dfl_2 = network->addConcatenation(inputTensor22_dfl_2, 3); + cat22_dfl_2->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = + addYoLoLayer(network, std::vector{cat22_dfl_0, cat22_dfl_1, cat22_dfl_2}, + strides, stridesLength, false, true); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} + +nvinfer1::IHostMemory * +buildEngineYolov8PoseP6(nvinfer1::IBuilder *builder, nvinfer1::IBuilderConfig *config, nvinfer1::DataType dt, + const std::string &wts_path, float &gd, float &gw, int &max_channels) { + std::map weightMap = loadWeights(wts_path); +// nvinfer1::INetworkDefinition *network = builder->createNetworkV2(0U); + nvinfer1::INetworkDefinition *network = builder->createNetworkV2( + 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)); + /******************************************************************************************************* + ****************************************** YOLOV8 INPUT ********************************************** + *******************************************************************************************************/ + nvinfer1::ITensor *data = network->addInput(kInputTensorName, dt, nvinfer1::Dims4{kBatchSize, 3, kInputH, kInputW}); + assert(data); + /******************************************************************************************************* + ***************************************** YOLOV8 BACKBONE ******************************************** + *******************************************************************************************************/ + nvinfer1::IElementWiseLayer *conv0 = + convBnSiLU(network, weightMap, *data, get_width(64, gw, max_channels), 3, 2, 1, "model.0"); + nvinfer1::IElementWiseLayer *conv1 = + convBnSiLU(network, weightMap, *conv0->getOutput(0), get_width(128, gw, max_channels), 3, 2, 1, "model.1"); + // 11233 + nvinfer1::IElementWiseLayer *conv2 = C2F(network, weightMap, *conv1->getOutput(0), get_width(128, gw, max_channels), + get_width(128, gw, max_channels), get_depth(3, gd), true, 0.5, "model.2"); + nvinfer1::IElementWiseLayer *conv3 = + convBnSiLU(network, weightMap, *conv2->getOutput(0), get_width(256, gw, max_channels), 3, 2, 1, "model.3"); + // 22466 + nvinfer1::IElementWiseLayer *conv4 = C2F(network, weightMap, *conv3->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(6, gd), true, 0.5, "model.4"); + nvinfer1::IElementWiseLayer *conv5 = + convBnSiLU(network, weightMap, *conv4->getOutput(0), get_width(512, gw, max_channels), 3, 2, 1, "model.5"); + // 22466 + nvinfer1::IElementWiseLayer *conv6 = C2F(network, weightMap, *conv5->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(6, gd), true, 0.5, "model.6"); + + nvinfer1::IElementWiseLayer *conv7 = + convBnSiLU(network, weightMap, *conv6->getOutput(0), get_width(768, gw, max_channels), 3, 2, 1, "model.7"); + nvinfer1::IElementWiseLayer *conv8 = C2F(network, weightMap, *conv7->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), true, 0.5, "model.8"); + + nvinfer1::IElementWiseLayer *conv9 = + convBnSiLU(network, weightMap, *conv8->getOutput(0), get_width(1024, gw, max_channels), 3, 2, 1, "model.9"); + nvinfer1::IElementWiseLayer *conv10 = + C2F(network, weightMap, *conv9->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), true, 0.5, "model.10"); + + nvinfer1::IElementWiseLayer *conv11 = + SPPF(network, weightMap, *conv10->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), 5, "model.11"); + + 
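+    // NOTE: compared with the stride-32 pose backbone above, this P6 variant adds one more downsampling
+    // stage: conv7/conv8 run at (scaled) width 768 (P5/32) and conv9/conv10 at width 1024 (P6/64)
+    // before SPPF, so the head below fuses and predicts on four scales, P3 (stride 8) through
+    // P6 (stride 64).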
/******************************************************************************************************* + ********************************************* YOLOV8 HEAD ******************************************** + *******************************************************************************************************/ + // Head + float scale[] = {1.0, 1.0, 2.0, 2.0}; // scale used for upsampling + + // P5 + nvinfer1::IResizeLayer *upsample12 = network->addResize(*conv11->getOutput(0)); + upsample12->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample12->setScales(scale, 4); + nvinfer1::ITensor *concat13_inputs[] = {upsample12->getOutput(0), conv8->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat13 = network->addConcatenation(concat13_inputs, 2); + nvinfer1::IElementWiseLayer *conv14 = + C2(network, weightMap, *concat13->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.14"); + + // P4 + nvinfer1::IResizeLayer *upsample15 = network->addResize(*conv14->getOutput(0)); + upsample15->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample15->setScales(scale, 4); + nvinfer1::ITensor *concat16_inputs[] = {upsample15->getOutput(0), conv6->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat16 = network->addConcatenation(concat16_inputs, 2); + nvinfer1::IElementWiseLayer *conv17 = + C2(network, weightMap, *concat16->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.17"); + + // P3 + nvinfer1::IResizeLayer *upsample18 = network->addResize(*conv17->getOutput(0)); + upsample18->setResizeMode(nvinfer1::InterpolationMode::kNEAREST); + upsample18->setScales(scale, 4); + nvinfer1::ITensor *concat19_inputs[] = {upsample18->getOutput(0), conv4->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat19 = network->addConcatenation(concat19_inputs, 2); + nvinfer1::IElementWiseLayer *conv20 = + C2(network, weightMap, *concat19->getOutput(0), get_width(256, gw, max_channels), + get_width(256, gw, max_channels), get_depth(3, gd), false, 0.5, "model.20"); + + // Additional layers for P4, P5, P6 + // P4/16-medium + nvinfer1::IElementWiseLayer *conv21 = convBnSiLU(network, weightMap, *conv20->getOutput(0), + get_width(256, gw, max_channels), 3, 2, 1, "model.21"); + nvinfer1::ITensor *concat22_inputs[] = {conv21->getOutput(0), conv17->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat22 = network->addConcatenation(concat22_inputs, 2); + nvinfer1::IElementWiseLayer *conv23 = + C2(network, weightMap, *concat22->getOutput(0), get_width(512, gw, max_channels), + get_width(512, gw, max_channels), get_depth(3, gd), false, 0.5, "model.23"); + + // P5/32-large + nvinfer1::IElementWiseLayer *conv24 = convBnSiLU(network, weightMap, *conv23->getOutput(0), + get_width(512, gw, max_channels), 3, 2, 1, "model.24"); + nvinfer1::ITensor *concat25_inputs[] = {conv24->getOutput(0), conv14->getOutput(0)}; + nvinfer1::IConcatenationLayer *concat25 = network->addConcatenation(concat25_inputs, 2); + nvinfer1::IElementWiseLayer *conv26 = + C2(network, weightMap, *concat25->getOutput(0), get_width(768, gw, max_channels), + get_width(768, gw, max_channels), get_depth(3, gd), false, 0.5, "model.26"); + + // P6/64-xlarge + nvinfer1::IElementWiseLayer *conv27 = convBnSiLU(network, weightMap, *conv26->getOutput(0), + get_width(768, gw, max_channels), 3, 2, 1, "model.27"); + nvinfer1::ITensor *concat28_inputs[] = {conv27->getOutput(0), conv11->getOutput(0)}; + 
nvinfer1::IConcatenationLayer *concat28 = network->addConcatenation(concat28_inputs, 2); + nvinfer1::IElementWiseLayer *conv29 = + C2(network, weightMap, *concat28->getOutput(0), get_width(1024, gw, max_channels), + get_width(1024, gw, max_channels), get_depth(3, gd), false, 0.5, "model.29"); + + /******************************************************************************************************* + ********************************************* YOLOV8 OUTPUT ****************************************** + *******************************************************************************************************/ + int base_in_channel = (gw == 1.25) ? 80 : 64; + int base_out_channel = (gw == 0.25) ? std::max(64, std::min(kPoseNumClass, 100)) : get_width(256, gw, max_channels); + + // output0 + nvinfer1::IElementWiseLayer *conv30_cv2_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_0_1 = + convBnSiLU(network, weightMap, *conv30_cv2_0_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_0_2 = + network->addConvolutionNd(*conv30_cv2_0_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.0.2.weight"], weightMap["model.30.cv2.0.2.bias"]); + conv30_cv2_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + + conv30_cv2_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_0 = + convBnSiLU(network, weightMap, *conv20->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.0.0"); + + nvinfer1::IElementWiseLayer *conv30_cv3_0_1 = convBnSiLU(network, weightMap, *conv30_cv3_0_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.0.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_0_2 = + network->addConvolutionNd(*conv30_cv3_0_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.0.2.weight"], weightMap["model.30.cv3.0.2.bias"]); + conv30_cv3_0_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_0_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_0[] = {conv30_cv2_0_2->getOutput(0), conv30_cv3_0_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_0 = network->addConcatenation(inputTensor30_0, 2); + + // output1 + nvinfer1::IElementWiseLayer *conv30_cv2_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_1_1 = + convBnSiLU(network, weightMap, *conv30_cv2_1_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_1_2 = + network->addConvolutionNd(*conv30_cv2_1_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.1.2.weight"], weightMap["model.30.cv2.1.2.bias"]); + conv30_cv2_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_1_0 = + convBnSiLU(network, weightMap, *conv23->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.1.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_1_1 = convBnSiLU(network, weightMap, *conv30_cv3_1_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.1.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_1_2 = + network->addConvolutionNd(*conv30_cv3_1_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.1.2.weight"], weightMap["model.30.cv3.1.2.bias"]); + conv30_cv3_1_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + 
conv30_cv3_1_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_1[] = {conv30_cv2_1_2->getOutput(0), conv30_cv3_1_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_1 = network->addConcatenation(inputTensor30_1, 2); + + // output2 + nvinfer1::IElementWiseLayer *conv30_cv2_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_2_1 = + convBnSiLU(network, weightMap, *conv30_cv2_2_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_2_2 = + network->addConvolutionNd(*conv30_cv2_2_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.2.2.weight"], weightMap["model.30.cv2.2.2.bias"]); + conv30_cv2_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_2_0 = + convBnSiLU(network, weightMap, *conv26->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.2.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_2_1 = convBnSiLU(network, weightMap, *conv30_cv3_2_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.2.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_2_2 = + network->addConvolutionNd(*conv30_cv3_2_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.2.2.weight"], weightMap["model.30.cv3.2.2.bias"]); + conv30_cv3_2_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_2_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_2[] = {conv30_cv2_2_2->getOutput(0), conv30_cv3_2_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_2 = network->addConcatenation(inputTensor30_2, 2); + + // output3 + nvinfer1::IElementWiseLayer *conv30_cv2_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv2_3_1 = + convBnSiLU(network, weightMap, *conv30_cv2_3_0->getOutput(0), base_in_channel, 3, 1, 1, "model.30.cv2.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv2_3_2 = + network->addConvolutionNd(*conv30_cv2_3_1->getOutput(0), 64, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv2.3.2.weight"], weightMap["model.30.cv2.3.2.bias"]); + conv30_cv2_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv2_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::IElementWiseLayer *conv30_cv3_3_0 = + convBnSiLU(network, weightMap, *conv29->getOutput(0), base_out_channel, 3, 1, 1, "model.30.cv3.3.0"); + nvinfer1::IElementWiseLayer *conv30_cv3_3_1 = convBnSiLU(network, weightMap, *conv30_cv3_3_0->getOutput(0), + base_out_channel, 3, 1, 1, "model.30.cv3.3.1"); + nvinfer1::IConvolutionLayer *conv30_cv3_3_2 = + network->addConvolutionNd(*conv30_cv3_3_1->getOutput(0), kPoseNumClass, nvinfer1::DimsHW{1, 1}, + weightMap["model.30.cv3.3.2.weight"], weightMap["model.30.cv3.3.2.bias"]); + conv30_cv3_3_2->setStrideNd(nvinfer1::DimsHW{1, 1}); + conv30_cv3_3_2->setPaddingNd(nvinfer1::DimsHW{0, 0}); + nvinfer1::ITensor *inputTensor30_3[] = {conv30_cv2_3_2->getOutput(0), conv30_cv3_3_2->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_3 = network->addConcatenation(inputTensor30_3, 2); + + /******************************************************************************************************* + ********************************************* YOLOV8 DETECT ****************************************** + *******************************************************************************************************/ + 
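+    // Decode layout (a sketch, based on how the slices below are assumed to line up):
+    // calculateStrides() reads the spatial size of conv3/conv5/conv7/conv9, which for a
+    // P6 model should resolve to strides {8, 16, 32, 64}. Each level is reshaped to
+    // [kBatchSize, 64 + kPoseNumClass, (kInputH/stride) * (kInputW/stride)], split into
+    // 64 DFL box channels and kPoseNumClass score channels, then concatenated with the
+    // cv4 keypoint branch before being handed to the YOLO plugin layer.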
nvinfer1::IElementWiseLayer *conv_layers[] = {conv3, conv5, conv7, conv9}; + int strides[sizeof(conv_layers) / sizeof(conv_layers[0])]; + calculateStrides(conv_layers, sizeof(conv_layers) / sizeof(conv_layers[0]), kInputH, strides); + int stridesLength = sizeof(strides) / sizeof(int); + + // P3 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_0 = + network->addShuffle(*cat30_0->getOutput(0)); // Reusing the previous cat30_0 as P3 concatenation layer + shuffle30_0->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}); + nvinfer1::ISliceLayer *split30_0_0 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_0_1 = network->addSlice( + *shuffle30_0->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[0]) * (kInputW / strides[0])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_0 = + DFL(network, weightMap, *split30_0_0->getOutput(0), 4, (kInputH / strides[0]) * (kInputW / strides[0]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det0 + auto shuffle_conv20 = cv4_conv_combined(network, weightMap, *conv20->getOutput(0), "model.30.cv4.0", + (kInputH / strides[0]) * (kInputW / strides[0]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_0[] = {dfl30_0->getOutput(0), split30_0_1->getOutput(0), + shuffle_conv20->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_0 = network->addConcatenation(inputTensor30_dfl_0, 3); + cat30_dfl_0->setAxis(1); + + // P4 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_1 = + network->addShuffle(*cat30_1->getOutput(0)); // Reusing the previous cat30_1 as P4 concatenation layer + shuffle30_1->setReshapeDimensions( + nvinfer1::Dims3{kBatchSize, 64 + kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}); + nvinfer1::ISliceLayer *split30_1_0 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 0, 0}, + nvinfer1::Dims3{kBatchSize, 64, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::ISliceLayer *split30_1_1 = network->addSlice( + *shuffle30_1->getOutput(0), + nvinfer1::Dims3{0, 64, 0}, + nvinfer1::Dims3{kBatchSize, kPoseNumClass, (kInputH / strides[1]) * (kInputW / strides[1])}, + nvinfer1::Dims3{1, 1, 1}); + nvinfer1::IShuffleLayer *dfl30_1 = + DFL(network, weightMap, *split30_1_0->getOutput(0), 4, (kInputH / strides[1]) * (kInputW / strides[1]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det1 + auto shuffle_conv23 = cv4_conv_combined(network, weightMap, *conv23->getOutput(0), "model.30.cv4.1", + (kInputH / strides[1]) * (kInputW / strides[1]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_1[] = {dfl30_1->getOutput(0), split30_1_1->getOutput(0), + shuffle_conv23->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_1 = network->addConcatenation(inputTensor30_dfl_1, 3); + cat30_dfl_1->setAxis(1); + + // P5 processing steps (remains unchanged) + nvinfer1::IShuffleLayer *shuffle30_2 = + network->addShuffle(*cat30_2->getOutput(0)); // Reusing the previous cat30_2 as P5 concatenation layer + shuffle30_2->setReshapeDimensions(nvinfer1::Dims3{kBatchSize,64 + kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}); + nvinfer1::ISliceLayer *split30_2_0 = network->addSlice( + *shuffle30_2->getOutput(0), + 
nvinfer1::Dims3{0,0, 0}, + nvinfer1::Dims3{kBatchSize,64, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::ISliceLayer *split30_2_1 = network->addSlice( + *shuffle30_2->getOutput(0), + nvinfer1::Dims3{0,64, 0}, + nvinfer1::Dims3{kBatchSize,kPoseNumClass, (kInputH / strides[2]) * (kInputW / strides[2])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::IShuffleLayer *dfl30_2 = + DFL(network, weightMap, *split30_2_0->getOutput(0), 4, (kInputH / strides[2]) * (kInputW / strides[2]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det2 + auto shuffle_conv26 = cv4_conv_combined(network, weightMap, *conv26->getOutput(0), "model.30.cv4.2", + (kInputH / strides[2]) * (kInputW / strides[2]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_2[] = {dfl30_2->getOutput(0), split30_2_1->getOutput(0), + shuffle_conv26->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_2 = network->addConcatenation(inputTensor30_dfl_2, 3); + cat30_dfl_2->setAxis(1); + + // P6 processing steps + nvinfer1::IShuffleLayer *shuffle30_3 = network->addShuffle(*cat30_3->getOutput(0)); + shuffle30_3->setReshapeDimensions(nvinfer1::Dims3{kBatchSize,64 + kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}); + nvinfer1::ISliceLayer *split30_3_0 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0,0, 0}, + nvinfer1::Dims3{kBatchSize,64, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::ISliceLayer *split30_3_1 = network->addSlice( + *shuffle30_3->getOutput(0), + nvinfer1::Dims3{0,64, 0}, + nvinfer1::Dims3{kBatchSize,kPoseNumClass, (kInputH / strides[3]) * (kInputW / strides[3])}, + nvinfer1::Dims3{1,1, 1}); + nvinfer1::IShuffleLayer *dfl30_3 = + DFL(network, weightMap, *split30_3_0->getOutput(0), 4, (kInputH / strides[3]) * (kInputW / strides[3]), 1, + 1, 0, "model.30.dfl.conv.weight"); + + // det3 + auto shuffle_conv29 = cv4_conv_combined(network, weightMap, *conv29->getOutput(0), "model.30.cv4.3", + (kInputH / strides[3]) * (kInputW / strides[3]), gw, "pose"); + nvinfer1::ITensor *inputTensor30_dfl_3[] = {dfl30_3->getOutput(0), split30_3_1->getOutput(0), + shuffle_conv29->getOutput(0)}; + nvinfer1::IConcatenationLayer *cat30_dfl_3 = network->addConcatenation(inputTensor30_dfl_3, 3); + cat30_dfl_3->setAxis(1); + + nvinfer1::IPluginV2Layer *yolo = addYoLoLayer( + network, std::vector{cat30_dfl_0, cat30_dfl_1, cat30_dfl_2, cat30_dfl_3}, + strides, stridesLength, false, true); + yolo->getOutput(0)->setName(kOutputTensorName); + network->markOutput(*yolo->getOutput(0)); + + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 16 * (1 << 20)); + +#if defined(USE_FP16) + config->setFlag(nvinfer1::BuilderFlag::kFP16); +#elif defined(USE_INT8) + std::cout << "Your platform support int8: " << (builder->platformHasFastInt8() ? "true" : "false") << std::endl; + assert(builder->platformHasFastInt8()); + config->setFlag(nvinfer1::BuilderFlag::kINT8); + auto* calibrator = new Int8EntropyCalibrator2(1, kInputW, kInputH, kInputQuantizationFolder, "int8calib.table", + kInputTensorName); + config->setInt8Calibrator(calibrator); +#endif + + std::cout << "Building engine, please wait for a while..." << std::endl; + nvinfer1::IHostMemory *serialized_model = builder->buildSerializedNetwork(*network, *config); + std::cout << "Build engine successfully!" 
<< std::endl; + + delete network; + + for (auto &mem: weightMap) { + free((void *) (mem.second.values)); + } + return serialized_model; +} diff --git a/src/postprocess.cpp b/src/postprocess.cpp new file mode 100644 index 0000000..f19acc0 --- /dev/null +++ b/src/postprocess.cpp @@ -0,0 +1,269 @@ +#include "postprocess.h" +#include "utils.h" + +cv::Rect get_rect(cv::Mat& img, float bbox[4]) { + float l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + + if (r_h > r_w) { + l = bbox[0]; + r = bbox[2]; + t = bbox[1] - (kInputH - r_w * img.rows) / 2; + b = bbox[3] - (kInputH - r_w * img.rows) / 2; + l = l / r_w; + r = r / r_w; + t = t / r_w; + b = b / r_w; + } else { + l = bbox[0] - (kInputW - r_h * img.cols) / 2; + r = bbox[2] - (kInputW - r_h * img.cols) / 2; + t = bbox[1]; + b = bbox[3]; + l = l / r_h; + r = r / r_h; + t = t / r_h; + b = b / r_h; + } + l = std::max(0.0f, l); + t = std::max(0.0f, t); + int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); + int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); + + return cv::Rect(int(round(l)), int(round(t)), width, height); +} + +cv::Rect get_rect_adapt_landmark(cv::Mat& img, float bbox[4], float lmk[kNumberOfPoints * 3]) { + float l, r, t, b; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + l = bbox[0] / r_w; + r = bbox[2] / r_w; + t = (bbox[1] - (kInputH - r_w * img.rows) / 2) / r_w; + b = (bbox[3] - (kInputH - r_w * img.rows) / 2) / r_w; + for (int i = 0; i < kNumberOfPoints * 3; i += 3) { + lmk[i] /= r_w; + lmk[i + 1] = (lmk[i + 1] - (kInputH - r_w * img.rows) / 2) / r_w; + // lmk[i + 2] + } + } else { + l = (bbox[0] - (kInputW - r_h * img.cols) / 2) / r_h; + r = (bbox[2] - (kInputW - r_h * img.cols) / 2) / r_h; + t = bbox[1] / r_h; + b = bbox[3] / r_h; + for (int i = 0; i < kNumberOfPoints * 3; i += 3) { + lmk[i] = (lmk[i] - (kInputW - r_h * img.cols) / 2) / r_h; + lmk[i + 1] /= r_h; + // lmk[i + 2] + } + } + l = std::max(0.0f, l); + t = std::max(0.0f, t); + int width = std::max(0, std::min(int(round(r - l)), img.cols - int(round(l)))); + int height = std::max(0, std::min(int(round(b - t)), img.rows - int(round(t)))); + + return cv::Rect(int(round(l)), int(round(t)), width, height); +} + +static float iou(float lbox[4], float rbox[4]) { + float interBox[] = { + (std::max)(lbox[0], rbox[0]), + (std::min)(lbox[2], rbox[2]), + (std::max)(lbox[1], rbox[1]), + (std::min)(lbox[3], rbox[3]), + }; + + if (interBox[2] > interBox[3] || interBox[0] > interBox[1]) + return 0.0f; + + float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]); + float unionBoxS = (lbox[2] - lbox[0]) * (lbox[3] - lbox[1]) + (rbox[2] - rbox[0]) * (rbox[3] - rbox[1]) - interBoxS; + return interBoxS / unionBoxS; +} + +static bool cmp(const Detection& a, const Detection& b) { + if (a.conf == b.conf) { + return a.bbox[0] < b.bbox[0]; + } + return a.conf > b.conf; +} + +void nms(std::vector& res, float* output, float conf_thresh, float nms_thresh) { + int det_size = sizeof(Detection) / sizeof(float); + std::map> m; + + for (int i = 0; i < output[0]; i++) { + if (output[1 + det_size * i + 4] <= conf_thresh) + continue; + Detection det; + memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float)); + if (m.count(det.class_id) == 0) + m.emplace(det.class_id, std::vector()); + m[det.class_id].push_back(det); + } + for (auto it = m.begin(); it != m.end(); it++) { + auto& dets = it->second; + 
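+        // Greedy per-class NMS: sort this class's candidates by confidence (cmp), keep
+        // the best box, and erase any later box whose IoU with it exceeds nms_thresh.
+        // Typical call site: nms(res, &output[b * kOutputSize], kConfThresh, kNmsThresh)
+        // for batch element b, as done by batch_nms() below.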
std::sort(dets.begin(), dets.end(), cmp); + for (size_t m = 0; m < dets.size(); ++m) { + auto& item = dets[m]; + res.push_back(item); + for (size_t n = m + 1; n < dets.size(); ++n) { + if (iou(item.bbox, dets[n].bbox) > nms_thresh) { + dets.erase(dets.begin() + n); + --n; + } + } + } + } +} + +void batch_nms(std::vector>& res_batch, float* output, int batch_size, int output_size, + float conf_thresh, float nms_thresh) { + res_batch.resize(batch_size); + for (int i = 0; i < batch_size; i++) { + nms(res_batch[i], &output[i * output_size], conf_thresh, nms_thresh); + } +} + +void process_decode_ptr_host(std::vector& res, const float* decode_ptr_host, int bbox_element, cv::Mat& img, + int count) { + Detection det; + for (int i = 0; i < count; i++) { + int basic_pos = 1 + i * bbox_element; + int keep_flag = decode_ptr_host[basic_pos + 6]; + if (keep_flag == 1) { + det.bbox[0] = decode_ptr_host[basic_pos + 0]; + det.bbox[1] = decode_ptr_host[basic_pos + 1]; + det.bbox[2] = decode_ptr_host[basic_pos + 2]; + det.bbox[3] = decode_ptr_host[basic_pos + 3]; + det.conf = decode_ptr_host[basic_pos + 4]; + det.class_id = decode_ptr_host[basic_pos + 5]; + res.push_back(det); + } + } +} + +void batch_process(std::vector>& res_batch, const float* decode_ptr_host, int batch_size, + int bbox_element, const std::vector& img_batch) { + res_batch.resize(batch_size); + int count = static_cast(*decode_ptr_host); + count = std::min(count, kMaxNumOutputBbox); + for (int i = 0; i < batch_size; i++) { + auto& img = const_cast(img_batch[i]); + process_decode_ptr_host(res_batch[i], &decode_ptr_host[i * count], bbox_element, img, count); + } +} + +void draw_bbox(std::vector& img_batch, std::vector>& res_batch) { + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect(img, res[j].bbox); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + } + } +} + +void draw_bbox_keypoints_line(std::vector& img_batch, std::vector>& res_batch) { + const std::vector> skeleton_pairs = { + {0, 1}, {0, 2}, {0, 5}, {0, 6}, {1, 2}, {1, 3}, {2, 4}, {5, 6}, {5, 7}, {5, 11}, + {6, 8}, {6, 12}, {7, 9}, {8, 10}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; + + for (size_t i = 0; i < img_batch.size(); i++) { + auto& res = res_batch[i]; + cv::Mat img = img_batch[i]; + for (size_t j = 0; j < res.size(); j++) { + cv::Rect r = get_rect_adapt_landmark(img, res[j].bbox, res[j].keypoints); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + + for (int k = 0; k < kNumberOfPoints * 3; k += 3) { + if (res[j].keypoints[k + 2] > 0.5) { + cv::circle(img, cv::Point((int)res[j].keypoints[k], (int)res[j].keypoints[k + 1]), 3, + cv::Scalar(0, 0x27, 0xC1), -1); + } + } + + for (const auto& bone : skeleton_pairs) { + int kp1_idx = bone.first * 3; + int kp2_idx = bone.second * 3; + if (res[j].keypoints[kp1_idx + 2] > 0.5 && res[j].keypoints[kp2_idx + 2] > 0.5) { + cv::Point p1((int)res[j].keypoints[kp1_idx], (int)res[j].keypoints[kp1_idx + 1]); + cv::Point p2((int)res[j].keypoints[kp2_idx], (int)res[j].keypoints[kp2_idx + 1]); + cv::line(img, p1, p2, cv::Scalar(0, 0x27, 0xC1), 2); + } + } + } + } +} + +cv::Mat scale_mask(cv::Mat mask, cv::Mat 
img) { + int x, y, w, h; + float r_w = kInputW / (img.cols * 1.0); + float r_h = kInputH / (img.rows * 1.0); + if (r_h > r_w) { + w = kInputW; + h = r_w * img.rows; + x = 0; + y = (kInputH - h) / 2; + } else { + w = r_h * img.cols; + h = kInputH; + x = (kInputW - w) / 2; + y = 0; + } + cv::Rect r(x, y, w, h); + cv::Mat res; + cv::resize(mask(r), res, img.size()); + return res; +} + +void draw_mask_bbox(cv::Mat& img, std::vector& dets, std::vector& masks, + std::unordered_map& labels_map) { + static std::vector colors = {0xFF3838, 0xFF9D97, 0xFF701F, 0xFFB21D, 0xCFD231, 0x48F90A, 0x92CC17, + 0x3DDB86, 0x1A9334, 0x00D4BB, 0x2C99A8, 0x00C2FF, 0x344593, 0x6473FF, + 0x0018EC, 0x8438FF, 0x520085, 0xCB38FF, 0xFF95C8, 0xFF37C7}; + for (size_t i = 0; i < dets.size(); i++) { + cv::Mat img_mask = scale_mask(masks[i], img); + auto color = colors[(int)dets[i].class_id % colors.size()]; + auto bgr = cv::Scalar(color & 0xFF, color >> 8 & 0xFF, color >> 16 & 0xFF); + + cv::Rect r = get_rect(img, dets[i].bbox); + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float val = img_mask.at(y, x); + if (val <= 0.5) + continue; + img.at(y, x)[0] = img.at(y, x)[0] / 2 + bgr[0] / 2; + img.at(y, x)[1] = img.at(y, x)[1] / 2 + bgr[1] / 2; + img.at(y, x)[2] = img.at(y, x)[2] / 2 + bgr[2] / 2; + } + } + + cv::rectangle(img, r, bgr, 2); + + // Get the size of the text + cv::Size textSize = + cv::getTextSize(labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), + cv::FONT_HERSHEY_PLAIN, 1.2, 2, NULL); + // Set the top left corner of the rectangle + cv::Point topLeft(r.x, r.y - textSize.height); + + // Set the bottom right corner of the rectangle + cv::Point bottomRight(r.x + textSize.width, r.y + textSize.height); + + // Set the thickness of the rectangle lines + int lineThickness = 2; + + // Draw the rectangle on the image + cv::rectangle(img, topLeft, bottomRight, bgr, -1); + + cv::putText(img, labels_map[(int)dets[i].class_id] + " " + to_string_with_precision(dets[i].conf), + cv::Point(r.x, r.y + 4), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar::all(0xFF), 2); + } +} diff --git a/src/postprocess.cu b/src/postprocess.cu new file mode 100644 index 0000000..3cae042 --- /dev/null +++ b/src/postprocess.cu @@ -0,0 +1,84 @@ +// +// Created by lindsay on 23-7-17. 
+// +#include "types.h" +#include "postprocess.h" + +static __global__ void +decode_kernel(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects) { + float count = predict[0]; + int position = (blockDim.x * blockIdx.x + threadIdx.x); + if (position >= count) return; + + float *pitem = predict + 1 + position * (sizeof(Detection) / sizeof(float)); + int index = atomicAdd(parray, 1); + if (index >= max_objects) return; + + float confidence = pitem[4]; + if (confidence < confidence_threshold) return; + + float left = pitem[0]; + float top = pitem[1]; + float right = pitem[2]; + float bottom = pitem[3]; + float label = pitem[5]; + + float *pout_item = parray + 1 + index * bbox_element; + *pout_item++ = left; + *pout_item++ = top; + *pout_item++ = right; + *pout_item++ = bottom; + *pout_item++ = confidence; + *pout_item++ = label; + *pout_item++ = 1; // 1 = keep, 0 = ignore +} + +static __device__ float +box_iou(float aleft, float atop, float aright, float abottom, float bleft, float btop, float bright, float bbottom) { + float cleft = max(aleft, bleft); + float ctop = max(atop, btop); + float cright = min(aright, bright); + float cbottom = min(abottom, bbottom); + float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f); + if (c_area == 0.0f) return 0.0f; + + float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop); + float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop); + return c_area / (a_area + b_area - c_area); +} + +static __global__ void nms_kernel(float *bboxes, int max_objects, float threshold) { + int position = (blockDim.x * blockIdx.x + threadIdx.x); + int count = bboxes[0]; + if (position >= count) return; + + float *pcurrent = bboxes + 1 + position * bbox_element; + for (int i = 0; i < count; ++i) { + float *pitem = bboxes + 1 + i * bbox_element; + if (i == position || pcurrent[5] != pitem[5]) continue; + if (pitem[4] >= pcurrent[4]) { + if (pitem[4] == pcurrent[4] && i < position) continue; + float iou = box_iou( + pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3], + pitem[0], pitem[1], pitem[2], pitem[3] + ); + if (iou > threshold) { + pcurrent[6] = 0; + return; + } + } + } +} + +void cuda_decode(float *predict, int num_bboxes, float confidence_threshold, float *parray, int max_objects, + cudaStream_t stream) { + int block = 256; + int grid = ceil(num_bboxes / (float)block); + decode_kernel<<>>((float*)predict, num_bboxes, confidence_threshold, parray, max_objects); +} + +void cuda_nms(float *parray, float nms_threshold, int max_objects, cudaStream_t stream) { + int block = max_objects < 256 ? 
max_objects : 256; + int grid = ceil(max_objects / (float)block); + nms_kernel<<>>(parray, max_objects, nms_threshold); +} diff --git a/src/preprocess.cu b/src/preprocess.cu new file mode 100644 index 0000000..14d9e77 --- /dev/null +++ b/src/preprocess.cu @@ -0,0 +1,155 @@ +#include "preprocess.h" +#include "cuda_utils.h" + +static uint8_t *img_buffer_host = nullptr; +static uint8_t *img_buffer_device = nullptr; + + +__global__ void +warpaffine_kernel(uint8_t *src, int src_line_size, int src_width, int src_height, float *dst, int dst_width, + int dst_height, uint8_t const_value_st, AffineMatrix d2s, int edge) { + int position = blockDim.x * blockIdx.x + threadIdx.x; + if (position >= edge) return; + + float m_x1 = d2s.value[0]; + float m_y1 = d2s.value[1]; + float m_z1 = d2s.value[2]; + float m_x2 = d2s.value[3]; + float m_y2 = d2s.value[4]; + float m_z2 = d2s.value[5]; + + int dx = position % dst_width; + int dy = position / dst_width; + float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f; + float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f; + float c0, c1, c2; + + if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) { + // out of range + c0 = const_value_st; + c1 = const_value_st; + c2 = const_value_st; + } else { + int y_low = floorf(src_y); + int x_low = floorf(src_x); + int y_high = y_low + 1; + int x_high = x_low + 1; + + uint8_t const_value[] = {const_value_st, const_value_st, const_value_st}; + float ly = src_y - y_low; + float lx = src_x - x_low; + float hy = 1 - ly; + float hx = 1 - lx; + float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + uint8_t *v1 = const_value; + uint8_t *v2 = const_value; + uint8_t *v3 = const_value; + uint8_t *v4 = const_value; + + if (y_low >= 0) { + if (x_low >= 0) + v1 = src + y_low * src_line_size + x_low * 3; + + if (x_high < src_width) + v2 = src + y_low * src_line_size + x_high * 3; + } + + if (y_high < src_height) { + if (x_low >= 0) + v3 = src + y_high * src_line_size + x_low * 3; + + if (x_high < src_width) + v4 = src + y_high * src_line_size + x_high * 3; + } + + c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]; + c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]; + c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]; + } + + // bgr to rgb + float t = c2; + c2 = c0; + c0 = t; + + // normalization + c0 = c0 / 255.0f; + c1 = c1 / 255.0f; + c2 = c2 / 255.0f; + + // rgbrgbrgb to rrrgggbbb + int area = dst_width * dst_height; + float *pdst_c0 = dst + dy * dst_width + dx; + float *pdst_c1 = pdst_c0 + area; + float *pdst_c2 = pdst_c1 + area; + *pdst_c0 = c0; + *pdst_c1 = c1; + *pdst_c2 = c2; +} + + + + +void cuda_preprocess(uint8_t *src, int src_width, int src_height, float *dst, int dst_width, int dst_height, + cudaStream_t stream) { + int img_size = src_width * src_height * 3; + // copy data to pinned memory + memcpy(img_buffer_host, src, img_size); + // copy data to device memory + CUDA_CHECK(cudaMemcpyAsync(img_buffer_device, img_buffer_host, img_size, cudaMemcpyHostToDevice, stream)); + + AffineMatrix s2d, d2s; + float scale = std::min(dst_height / (float) src_height, dst_width / (float) src_width); + + s2d.value[0] = scale; + s2d.value[1] = 0; + s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5; + s2d.value[3] = 0; + s2d.value[4] = scale; + s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5; + cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value); + cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value); + cv::invertAffineTransform(m2x3_s2d, m2x3_d2s); + + memcpy(d2s.value, m2x3_d2s.ptr(0), 
sizeof(d2s.value)); + + int jobs = dst_height * dst_width; + int threads = 256; + int blocks = ceil(jobs / (float) threads); + warpaffine_kernel<<>>( + img_buffer_device, src_width * 3, src_width, + src_height, dst, dst_width, + dst_height, 128, d2s, jobs); +} + + +void cuda_batch_preprocess(std::vector &img_batch, + float *dst, int dst_width, int dst_height, + cudaStream_t stream) { + int dst_size = dst_width * dst_height * 3; + for (size_t i = 0; i < img_batch.size(); i++) { + cuda_preprocess(img_batch[i].ptr(), img_batch[i].cols, img_batch[i].rows, &dst[dst_size * i], dst_width, + dst_height, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } +} + + + + + +void cuda_preprocess_init(int max_image_size) { + // prepare input data in pinned memory + CUDA_CHECK(cudaMallocHost((void **) &img_buffer_host, max_image_size * 3)); + // prepare input data in device memory + CUDA_CHECK(cudaMalloc((void **) &img_buffer_device, max_image_size * 3)); +} + +void cuda_preprocess_destroy() { + CUDA_CHECK(cudaFree(img_buffer_device)); + CUDA_CHECK(cudaFreeHost(img_buffer_host)); +} + + + + diff --git a/yolov8_cls.cpp b/yolov8_cls.cpp new file mode 100644 index 0000000..1ab490b --- /dev/null +++ b/yolov8_cls.cpp @@ -0,0 +1,305 @@ +#include "cuda_utils.h" +#include "logging.h" +#include "utils.h" +#include "model.h" +#include "config.h" +#include "calibrator.h" + +#include +#include +#include +#include +#include + +using namespace nvinfer1; + +static Logger gLogger; +const static int kOutputSize = kClsNumClass; + +void batch_preprocess(std::vector &imgs, float *output, int dst_width = 224, int dst_height = 224) { + for (size_t b = 0; b < imgs.size(); b++) { + int h = imgs[b].rows; + int w = imgs[b].cols; + int m = std::min(h, w); + int top = (h - m) / 2; + int left = (w - m) / 2; + cv::Mat img = imgs[b](cv::Rect(left, top, m, m)); + cv::resize(img, img, cv::Size(dst_width, dst_height), 0, 0, cv::INTER_LINEAR); + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + img.convertTo(img, CV_32F, 1 / 255.0); + + std::vector channels(3); + cv::split(img, channels); + + // CHW format + for (int c = 0; c < 3; ++c) { + int i = 0; + for (int row = 0; row < dst_height; ++row) { + for (int col = 0; col < dst_width; ++col) { + output[b * 3 * dst_height * dst_width + c * dst_height * dst_width + i] = + channels[c].at(row, col); + ++i; + } + } + } + } +} + +std::vector softmax(float *prob, int n) { + std::vector res; + float sum = 0.0f; + float t; + for (int i = 0; i < n; i++) { + t = expf(prob[i]); + res.push_back(t); + sum += t; + } + for (int i = 0; i < n; i++) { + res[i] /= sum; + } + return res; +} + +std::vector topk(const std::vector &vec, int k) { + std::vector topk_index; + std::vector vec_index(vec.size()); + std::iota(vec_index.begin(), vec_index.end(), 0); + + std::sort(vec_index.begin(), vec_index.end(), + [&vec](size_t index_1, size_t index_2) { return vec[index_1] > vec[index_2]; }); + + int k_num = std::min(vec.size(), k); + + for (int i = 0; i < k_num; ++i) { + topk_index.push_back(vec_index[i]); + } + + return topk_index; +} + +std::vector read_classes(std::string file_name) { + std::vector classes; + std::ifstream ifs(file_name, std::ios::in); + if (!ifs.is_open()) { + std::cerr << file_name << " is not found, pls refer to README and download it." 
<< std::endl; + assert(0); + } + std::string s; + while (std::getline(ifs, s)) { + classes.push_back(s); + } + ifs.close(); + return classes; +} + +bool +parse_args(int argc, char **argv, std::string &wts, std::string &engine, float &gd, float &gw, std::string &img_dir) { + if (argc < 4) return false; + if (std::string(argv[1]) == "-s" && (argc == 5)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto net = std::string(argv[4]); + if (net[0] == 'n') { + gd = 0.33; + gw = 0.25; + } else if (net[0] == 's') { + gd = 0.33; + gw = 0.50; + } else if (net[0] == 'm') { + gd = 0.67; + gw = 0.75; + } else if (net[0] == 'l') { + gd = 1.0; + gw = 1.0; + } else if (net[0] == 'x') { + gd = 1.0; + gw = 1.25; + } else { + return false; + } + } else if (std::string(argv[1]) == "-d" && argc == 4) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + } else { + return false; + } + return true; +} + +void prepare_buffers(ICudaEngine *engine, float **gpu_input_buffer, float **gpu_output_buffer, float **cpu_input_buffer, + float **output_buffer_host) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. + // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) gpu_input_buffer, kBatchSize * 3 * kClsInputH * kClsInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) gpu_output_buffer, kBatchSize * kOutputSize * sizeof(float))); + + *cpu_input_buffer = new float[kBatchSize * 3 * kClsInputH * kClsInputW]; + *output_buffer_host = new float[kBatchSize * kOutputSize]; +} + +void +infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *input, float *output, int batchSize) { + CUDA_CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * kClsInputH * kClsInputW * sizeof(float), + cudaMemcpyHostToDevice, stream)); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + cudaStreamSynchronize(stream); +} + +void +serialize_engine(unsigned int max_batchsize, float &gd, float &gw, std::string &wts_name, std::string &engine_name) { + // Create builder + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + // Create model to populate the network, then set the outputs and create an engine + IHostMemory *serialized_engine = nullptr; + //engine = buildEngineYolov8Cls(max_batchsize, builder, config, DataType::kFLOAT, gd, gw, wts_name); + serialized_engine = buildEngineYolov8Cls(builder, config, DataType::kFLOAT, wts_name, gd, gw); + assert(serialized_engine); + // Save engine to file + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cerr << "Could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), 
serialized_engine->size()); + + // Close everything down + delete serialized_engine; + delete config; + delete builder; +} + +void +deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n-cls.wts ../models/yolov8n-cls.fp32.trt n + // -d ../models/yolov8n-cls.fp32.trt ../images + cudaSetDevice(kGpuId); + + std::string wts_name = ""; + std::string engine_name = ""; + float gd = 0.0f, gw = 0.0f; + std::string img_dir; + + if (!parse_args(argc, argv, wts_name, engine_name, gd, gw, img_dir)) { + std::cerr << "arguments not right!" << std::endl; + std::cerr << "./yolov8_cls -s [.wts] [.engine] [n/s/m/l/x or c gd gw] // serialize model to plan file" + << std::endl; + std::cerr << "./yolov8_cls -d [.engine] ../samples // deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(kBatchSize, gd, gw, wts_name, engine_name); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + // Prepare cpu and gpu buffers + float *device_buffers[2]; + float *cpu_input_buffer = nullptr; + float *output_buffer_host = nullptr; + prepare_buffers(engine, &device_buffers[0], &device_buffers[1], &cpu_input_buffer, &output_buffer_host); + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." 
<< std::endl; + return -1; + } + + // Read imagenet labels + auto classes = read_classes("imagenet_classes.txt"); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + + // Preprocess + batch_preprocess(img_batch, cpu_input_buffer); + + // Run inference + auto start = std::chrono::system_clock::now(); + infer(*context, stream, (void **) device_buffers, cpu_input_buffer, output_buffer_host, kBatchSize); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + + // Postprocess and get top-k result + for (size_t b = 0; b < img_name_batch.size(); b++) { + float *p = &output_buffer_host[b * kOutputSize]; + auto res = softmax(p, kOutputSize); + auto topk_idx = topk(res, 3); + std::cout << img_name_batch[b] << std::endl; + for (auto idx: topk_idx) { + std::cout << " " << classes[idx] << " " << res[idx] << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + delete[] cpu_input_buffer; + delete[] output_buffer_host; + // Destroy the engine + delete context; + delete engine; + delete runtime; + return 0; +} diff --git a/yolov8_cls_trt.py b/yolov8_cls_trt.py new file mode 100644 index 0000000..514d1f5 --- /dev/null +++ b/yolov8_cls_trt.py @@ -0,0 +1,288 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import os +import shutil +import sys +import threading +import time +import cv2 +import numpy as np +import torch +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +with open("build/imagenet_classes.txt") as f: + classes = [line.strip() for line in f.readlines()] + + +class YoLov8TRT(object): + """ + description: A YOLOv5 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. 
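+                # TensorRT 10 removed the old binding-index API (binding_is_input,
+                # get_binding_shape), so tensors are classified by name here via
+                # engine.get_tensor_mode(); shapes and dtypes come from
+                # get_tensor_shape() / get_tensor_dtype() above.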
+ if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_input_image = np.empty( + shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + batch_image_raw.append(image_raw) + input_image = self.preprocess_cls_image(image_raw) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + classes_ls, predicted_conf_ls, category_id_ls = self.postprocess_cls( + output) + cv2.putText(batch_image_raw[i], str( + classes_ls), (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1, cv2.LINE_AA) + print(classes_ls, predicted_conf_ls) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. 
+ self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_cls_image(self, raw_bgr_image, dst_width=224, dst_height=224): + + """ + description: Convert BGR image to RGB, + crop the center square frame, + resize it to target size, normalize to [0,1], + transform to NCHW format. + param: + raw_bgr_image: numpy array, raw BGR image + dst_width: int, target image width + dst_height: int, target image height + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + # Crop the center square frame + m = min(h, w) + top = (h - m) // 2 + left = (w - m) // 2 + image = raw_bgr_image[top:top + m, left:left + m] + + # Resize the image with target size while maintaining ratio + image = cv2.resize(image, (dst_width, dst_height), interpolation=cv2.INTER_LINEAR) + + # Convert BGR to RGB + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Normalize to [0,1] + image = image.astype(np.float32) / 255.0 + + # HWC to CHW format + image = image.transpose(2, 0, 1) + + # CHW to NCHW format (add batch dimension) + image = np.expand_dims(image, axis=0) + + # Convert the image to row-major order, also known as "C order" + image = np.ascontiguousarray(image) + + batch_data = np.expand_dims(image, axis=0) + + return batch_data + + def postprocess_cls(self, output_data): + classes_ls = [] + predicted_conf_ls = [] + category_id_ls = [] + output_data = output_data.reshape(self.batch_size, -1) + output_data = torch.Tensor(output_data) + p = torch.nn.functional.softmax(output_data, dim=1) + score, index = torch.topk(p, 3) + for ind in range(index.shape[0]): + input_category_id = index[ind][0].item() # 716 + category_id_ls.append(input_category_id) + predicted_confidence = score[ind][0].item() + predicted_conf_ls.append(predicted_confidence) + classes_ls.append(classes[input_category_id]) + return classes_ls, predicted_conf_ls, category_id_ls + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer( + self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format( + self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer( + self.yolov8_wrapper.get_raw_image_zeros()) + print( + 'warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + engine_file_path = "./yolov8x-cls-fp32.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + + if os.path.exists('output/'): 
+ shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches( + yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_det.cpp b/yolov8_det.cpp new file mode 100644 index 0000000..552df96 --- /dev/null +++ b/yolov8_det.cpp @@ -0,0 +1,301 @@ +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * sizeof(Detection) / sizeof(float) + 1; + +void serialize_engine(std::string &wts_name, std::string &engine_name, int &is_p, std::string &sub_type, float &gd, + float &gw, int &max_channels) { + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + IHostMemory *serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8DetP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + serialized_engine = buildEngineYolov8DetP2(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else { + serialized_engine = buildEngineYolov8Det(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, + IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, + float **output_buffer_host, float **decode_ptr_host, float **decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void **) decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, int batchsize, + float *decode_ptr_host, float *decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float *) buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, int &is_p, std::string &img_dir, + std::string &sub_type, std::string &cuda_post_process, float &gd, float &gw, int &max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } 
else if (sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n.wts ../models/yolov8n.fp32.trt n + // -d ../models/yolov8n.fp32.trt ../images c + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" << std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" + << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float *device_buffers[2]; + float *output_buffer_host = nullptr; + float *decode_ptr_host = nullptr; + float *decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." << std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + if (img.empty()) { + std::cerr << "Fatal error: image cannot open!" 
<< std::endl; + return -1; + } + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void **) device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + //Process gpu decode and nms results + batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + + // Draw bounding boxes + draw_bbox(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + return 0; +} diff --git a/yolov8_det_trt.py b/yolov8_det_trt.py new file mode 100644 index 0000000..81546e8 --- /dev/null +++ b/yolov8_det_trt.py @@ -0,0 +1,471 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. 
+ param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. + if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + self.det_output_length = host_outputs[0].shape[0] + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. 
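+        # Added note: PyCUDA contexts are per-thread, and infer() may run on a worker thread
+        # (see inferThread/warmUpThread below), so the context created in __init__ must be pushed
+        # onto this thread's context stack before any CUDA call and popped again at the end of the
+        # method; an unbalanced push/pop usually shows up later as context-related pycuda errors.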
+ self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + result_boxes, result_scores, result_classid = self.post_process( + output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], + batch_origin_w[i] + ) + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. 
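+        Added note on the letterboxing below: the image is scaled by r = min(input_w / w,
+        input_h / h) so the long side fits the network input, and the leftover short-side space
+        is padded symmetrically with gray (128, 128, 128); e.g. a 1280x720 frame fed to a
+        640x640 engine becomes 640x360 plus 140 px of padding on top and bottom.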
+ param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy(self, origin_h, origin_w, x): + """ + description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + param: + origin_h: height of original image + origin_w: width of original image + x: A boxes numpy, each row is a box [center_x, center_y, w, h] + return: + y: A boxes numpy, each row is a box [x1, y1, x2, y2] + """ + y = np.zeros_like(x) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + if r_h > r_w: + y[:, 0] = x[:, 0] + y[:, 2] = x[:, 2] + y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 + y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 + y /= r_w + else: + y[:, 0] = x[:, 0] - (self.input_w - r_h * origin_w) / 2 + y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 + y[:, 1] = x[:, 1] + y[:, 3] = x[:, 3] + y /= r_h + + return y + + def post_process(self, output, origin_h, origin_w): + """ + description: postprocess the prediction + param: + output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
+ origin_h: height of original image + origin_w: width of original image + return: + result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] + result_scores: finally scores, a numpy, each element is the score correspoing to box + result_classid: finally classid, a numpy, each element is the classid correspoing to box + """ + num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + # Get the num of boxes detected + num = int(output[0]) + # Reshape to a two dimentional ndarray + # pred = np.reshape(output[1:], (-1, 38))[:num, :] + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + # Do nms + boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + return result_boxes, result_scores, result_classid + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) + * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
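+        Added note: the suppression is class-aware, i.e. a kept box only removes overlapping
+        boxes that share its cls_id (the label_match test below), so boxes of different classes
+        may still overlap heavily in the final output.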
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) + # clip the coordinates + boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) + boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) + boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) + boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) + # Object confidence + confs = boxes[:, 4] + # Sort by the confs + boxes = boxes[np.argsort(-confs)] + # Perform non-maximum suppression + keep_boxes = [] + while boxes.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres + label_match = boxes[0, -1] == boxes[:, -1] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + keep_boxes += [boxes[0]] + boxes = boxes[~invalid] + boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) + return boxes + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "build/libmyplugins.so" + engine_file_path = "yolov8s.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", + "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", + "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", + "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", + "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", 
"vase", "scissors", + "teddy bear", + "hair drier", "toothbrush"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_pose.cpp b/yolov8_pose.cpp new file mode 100644 index 0000000..da1b8cf --- /dev/null +++ b/yolov8_pose.cpp @@ -0,0 +1,297 @@ + +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 32) / sizeof(float) + 1; + +void serialize_engine(std::string& wts_name, std::string& engine_name, int& is_p, std::string& sub_type, float& gd, + float& gw, int& max_channels) { + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + IHostMemory* serialized_engine = nullptr; + + if (is_p == 6) { + serialized_engine = buildEngineYolov8PoseP6(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } else if (is_p == 2) { + std::cout << "p2 is not supported right now" << std::endl; + } else { + serialized_engine = buildEngineYolov8Pose(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + } + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string& engine_name, IRuntime** runtime, ICudaEngine** engine, + IExecutionContext** context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" << std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char* serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine* engine, float** input_buffer_device, float** output_buffer_device, + float** output_buffer_host, float** decode_ptr_host, float** decode_ptr_device, + std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 2); + // In order to bind the buffers, we need to know the names of the input and output tensors. 
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void**)input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void**)output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void**)decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* output, int batchsize, + float* decode_ptr_host, float* decode_ptr_device, int model_bboxes, std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float*)buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char** argv, std::string& wts, std::string& engine, int& is_p, std::string& img_dir, + std::string& sub_type, std::string& cuda_post_process, float& gd, float& gw, int& max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && (argc == 5 || argc == 7)) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + auto sub_type = std::string(argv[4]); + + if (sub_type[0] == 'n') { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type[0] == 's') { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type[0] == 'm') { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type[0] == 'l') { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if 
(sub_type[0] == 'x') { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + if (sub_type.size() == 2 && sub_type[1] == '6') { + is_p = 6; + } else if (sub_type.size() == 2 && sub_type[1] == '2') { + is_p = 2; + } + } else if (std::string(argv[1]) == "-d" && argc == 5) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + } else { + return false; + } + return true; +} + +int main(int argc, char** argv) { + // -s ../models/yolov8n-pose.wts ../models/yolov8n-pose.fp32.trt n + // -d ../models/yolov8n-pose.fp32.trt ../images c + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + int model_bboxes; + int is_p = 0; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, is_p, img_dir, sub_type, cuda_post_process, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" << std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x/n2/s2/m2/l2/x2/n6/s6/m6/l6/x6] // serialize model to " + "plan file" + << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g]// deserialize plan file and run inference" << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, is_p, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime* runtime = nullptr; + ICudaEngine* engine = nullptr; + IExecutionContext* context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float* device_buffers[2]; + float* output_buffer_host = nullptr; + float* decode_ptr_host = nullptr; + float* decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." 
<< std::endl; + return -1; + } + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &output_buffer_host, &decode_ptr_host, + &decode_ptr_device, cuda_post_process); + + // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void**)device_buffers, output_buffer_host, kBatchSize, decode_ptr_host, + decode_ptr_device, model_bboxes, cuda_post_process); + std::vector> res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // todo pose in gpu + std::cerr << "pose_postprocess is not support in gpu right now" << std::endl; + } + // Draw bounding boxes + draw_bbox_keypoints_line(img_batch, res_batch); + // Save images + for (size_t j = 0; j < img_batch.size(); j++) { + cv::imwrite("_" + img_name_batch[j], img_batch[j]); + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + //std::cout << "\nOutput:\n\n"; + //for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + //std::cout << std::endl; + + return 0; +} diff --git a/yolov8_pose_trt.py b/yolov8_pose_trt.py new file mode 100644 index 0000000..a4a1183 --- /dev/null +++ b/yolov8_pose_trt.py @@ -0,0 +1,511 @@ +""" +An example that uses TensorRT's Python api to make inferences. 
+""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 +keypoint_pairs = [ + (0, 1), (0, 2), (0, 5), (0, 6), (1, 2), + (1, 3), (2, 4), (5, 6), (5, 7), (5, 11), + (6, 8), (6, 12), (7, 9), (8, 10), (11, 12), + (11, 13), (12, 14), (13, 15), (14, 16) +] + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. + param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + # Append to the appropriate list. 
+ if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + self.det_output_size = host_outputs[0].shape[0] + print('batch_size:', self.batch_size) + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], + input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. 
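+        # Added note on the layout assumed by the slicing below: each image owns det_output_size
+        # floats of host_outputs[0]; element 0 of its slice is the detection count, followed by
+        # fixed-size rows of DET_NUM + SEG_NUM + POSE_NUM = 6 + 32 + 51 = 89 values
+        # (box/conf/class, unused mask-coefficient slots, then 17 keypoints as x, y, conf).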
+ self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + # Do postprocess + for i in range(self.batch_size): + + result_boxes, result_scores, result_classid, keypoints = self.post_process( + output[i * (self.det_output_size): (i + 1) * (self.det_output_size)], + batch_origin_h[i], batch_origin_w[i] + ) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + + num_keypoints = len(keypoints[j]) // 3 + points = [] + for k in range(num_keypoints): + x = keypoints[j][k * 3] + y = keypoints[j][k * 3 + 1] + confidence = keypoints[j][k * 3 + 2] + if confidence > 0: + points.append((int(x), int(y))) + else: + points.append(None) + + # 根据关键点索引对绘制线条 + for pair in keypoint_pairs: + partA, partB = pair + if points[partA] and points[partB]: + cv2.line(batch_image_raw[i], points[partA], points[partB], (0, 255, 0), 2) + + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. + param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy_with_keypoints(self, origin_h, origin_w, boxes, keypoints): + + n = len(boxes) + box_array = np.zeros_like(boxes) + keypoint_array = np.zeros_like(keypoints) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + for i in range(n): + if r_h > r_w: + box = boxes[i] + lmk = keypoints[i] + box_array[i, 0] = box[0] / r_w + box_array[i, 2] = box[2] / r_w + box_array[i, 1] = (box[1] - (self.input_h - r_w * origin_h) / 2) / r_w + box_array[i, 3] = (box[3] - (self.input_h - r_w * origin_h) / 2) / r_w + + for j in range(0, 
len(lmk), 3): + keypoint_array[i, j] = lmk[j] / r_w + keypoint_array[i, j + 1] = (lmk[j + 1] - (self.input_h - r_w * origin_h) / 2) / r_w + keypoint_array[i, j + 2] = lmk[j + 2] + else: + + box = boxes[i] + lmk = keypoints[i] + + box_array[i, 0] = (box[0] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 2] = (box[2] - (self.input_w - r_h * origin_w) / 2) / r_h + box_array[i, 1] = box[1] / r_h + box_array[i, 3] = box[3] / r_h + + for j in range(0, len(lmk), 3): + keypoint_array[i, j] = (lmk[j] - (self.input_w - r_h * origin_w) / 2) / r_h + keypoint_array[i, j + 1] = lmk[j + 1] / r_h + keypoint_array[i, j + 2] = lmk[j + 2] + + return box_array, keypoint_array + + def post_process(self, output, origin_h, origin_w): + """ + description: Post-process the prediction to include pose keypoints + param: + output: A numpy array like [num_boxes, cx, cy, w, h, conf, + cls_id, px1, py1, pconf1,...px17, py17, pconf17] where p denotes pose keypoint + origin_h: Height of original image + origin_w: Width of original image + return: + result_boxes: Final boxes, a numpy array, each row is a box [x1, y1, x2, y2] + result_scores: Final scores, a numpy array, each element is the score corresponding to box + result_classid: Final classID, a numpy array, each element is the classid corresponding to box + result_keypoints: Final keypoints, a list of numpy arrays, + each element represents keypoints for a box, shaped as (#keypoints, 3) + """ + # Number of values per detection: 38 base values + 17 keypoints * 3 values each + num_values_per_detection = DET_NUM + SEG_NUM + POSE_NUM + # Get the number of boxes detected + num = int(output[0]) + # Reshape to a two-dimensional ndarray with the full detection shape + pred = np.reshape(output[1:], (-1, num_values_per_detection))[:num, :] + + # Perform non-maximum suppression to filter the detections + boxes = self.non_max_suppression( + pred[:, :num_values_per_detection], origin_h, origin_w, + conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + + # Extract the bounding boxes, confidence scores, and class IDs + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + result_keypoints = boxes[:, -POSE_NUM:] if len(boxes) else np.array([]) + + # Return the post-processed results including keypoints + return result_boxes, result_scores, result_classid, result_keypoints + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) 
+ inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = np.clip( + inter_rect_x2 - inter_rect_x1 + 1, 0, None) * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. + param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + res_array = np.copy(boxes) + box_pred_deep_copy = np.copy(boxes[:, :4]) + keypoints_pred_deep_copy = np.copy(boxes[:, -POSE_NUM:]) + res_box, res_keypoints = self.xywh2xyxy_with_keypoints( + origin_h, origin_w, box_pred_deep_copy, keypoints_pred_deep_copy) + res_array[:, :4] = res_box + res_array[:, -POSE_NUM:] = res_keypoints + # clip the coordinates + res_array[:, 0] = np.clip(res_array[:, 0], 0, origin_w - 1) + res_array[:, 2] = np.clip(res_array[:, 2], 0, origin_w - 1) + res_array[:, 1] = np.clip(res_array[:, 1], 0, origin_h - 1) + res_array[:, 3] = np.clip(res_array[:, 3], 0, origin_h - 1) + # Object confidence + confs = res_array[:, 4] + # Sort by the confs + res_array = res_array[np.argsort(-confs)] + # Perform non-maximum suppression + keep_res_array = [] + while res_array.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(res_array[0, :4], 0), res_array[:, :4]) > nms_thres + label_match = res_array[0, 5] == res_array[:, 5] + invalid = large_overlap & label_match + keep_res_array.append(res_array[0]) + res_array = res_array[~invalid] + + res_array = np.stack(keep_res_array, 0) if len(keep_res_array) else np.array([]) + return res_array + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "./build/libmyplugins.so" + engine_file_path = "yolov8n-pose.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + 
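+    # Added note: the plugin library is loaded with ctypes.CDLL below before the engine is
+    # deserialized, because the serialized engine references the custom YOLO layer plugin built
+    # by this repo. Example invocation (paths are illustrative):
+    #   python yolov8_pose_trt.py yolov8n-pose.engine ./build/libmyplugins.so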
if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = ["person"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy() diff --git a/yolov8_seg.cpp b/yolov8_seg.cpp new file mode 100644 index 0000000..dce442f --- /dev/null +++ b/yolov8_seg.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include "cuda_utils.h" +#include "logging.h" +#include "model.h" +#include "postprocess.h" +#include "preprocess.h" +#include "utils.h" + +Logger gLogger; +using namespace nvinfer1; +const int kOutputSize = kMaxNumOutputBbox * (sizeof(Detection) - sizeof(float) * 51) / sizeof(float) + 1; +const static int kOutputSegSize = 32 * (kInputH / 4) * (kInputW / 4); + +static cv::Rect get_downscale_rect(float bbox[4], float scale) { + + float left = bbox[0]; + float top = bbox[1]; + float right = bbox[0] + bbox[2]; + float bottom = bbox[1] + bbox[3]; + + left = left < 0 ? 0 : left; + top = top < 0 ? 0 : top; + right = right > 640 ? 640 : right; + bottom = bottom > 640 ? 640 : bottom; + + left /= scale; + top /= scale; + right /= scale; + bottom /= scale; + return cv::Rect(int(left), int(top), int(right - left), int(bottom - top)); +} + +std::vector process_mask(const float *proto, int proto_size, std::vector &dets) { + + std::vector masks; + for (size_t i = 0; i < dets.size(); i++) { + + cv::Mat mask_mat = cv::Mat::zeros(kInputH / 4, kInputW / 4, CV_32FC1); + auto r = get_downscale_rect(dets[i].bbox, 4); + + for (int x = r.x; x < r.x + r.width; x++) { + for (int y = r.y; y < r.y + r.height; y++) { + float e = 0.0f; + for (int j = 0; j < 32; j++) { + e += dets[i].mask[j] * proto[j * proto_size / 32 + y * mask_mat.cols + x]; + } + e = 1.0f / (1.0f + expf(-e)); + mask_mat.at(y, x) = e; + } + } + cv::resize(mask_mat, mask_mat, cv::Size(kInputW, kInputH)); + masks.push_back(mask_mat); + } + return masks; +} + +void serialize_engine(std::string &wts_name, std::string &engine_name, std::string &sub_type, float &gd, float &gw, + int &max_channels) { + IBuilder *builder = createInferBuilder(gLogger); + IBuilderConfig *config = builder->createBuilderConfig(); + IHostMemory *serialized_engine = nullptr; + + serialized_engine = buildEngineYolov8Seg(builder, config, DataType::kFLOAT, wts_name, gd, gw, max_channels); + + assert(serialized_engine); + std::ofstream p(engine_name, std::ios::binary); + if (!p) { + std::cout << "could not open plan output file" << std::endl; + assert(false); + } + p.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + + delete serialized_engine; + delete config; + delete builder; +} + +void deserialize_engine(std::string &engine_name, IRuntime **runtime, ICudaEngine **engine, + IExecutionContext **context) { + std::ifstream file(engine_name, std::ios::binary); + if (!file.good()) { + std::cerr << "read " << engine_name << " error!" 
<< std::endl; + assert(false); + } + size_t size = 0; + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + char *serialized_engine = new char[size]; + assert(serialized_engine); + file.read(serialized_engine, size); + file.close(); + + *runtime = createInferRuntime(gLogger); + assert(*runtime); + *engine = (*runtime)->deserializeCudaEngine(serialized_engine, size); + assert(*engine); + *context = (*engine)->createExecutionContext(); + assert(*context); + delete[] serialized_engine; +} + +void prepare_buffer(ICudaEngine *engine, float **input_buffer_device, float **output_buffer_device, + float **output_seg_buffer_device, float **output_buffer_host, float **output_seg_buffer_host, + float **decode_ptr_host, float **decode_ptr_device, std::string cuda_post_process) { + assert(engine->getNbIOTensors() == 3); + // In order to bind the buffers, we need to know the names of the input and output tensors. + // Note that indices are guaranteed to be less than IEngine::getNbBindings() + TensorIOMode input_mode = engine->getTensorIOMode(kInputTensorName); + if (input_mode != TensorIOMode::kINPUT) { + std::cerr << kInputTensorName << " should be input tensor" << std::endl; + assert(false); + } + TensorIOMode output_mode = engine->getTensorIOMode(kOutputTensorName); + if (output_mode != TensorIOMode::kOUTPUT) { + std::cerr << kOutputTensorName << " should be output tensor" << std::endl; + assert(false); + } + TensorIOMode proto_mode = engine->getTensorIOMode(kProtoTensorName); + if (proto_mode != TensorIOMode::kOUTPUT) { + std::cerr << kProtoTensorName << " should be output tensor" << std::endl; + assert(false); + } + // Create GPU buffers on device + CUDA_CHECK(cudaMalloc((void **) input_buffer_device, kBatchSize * 3 * kInputH * kInputW * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_buffer_device, kBatchSize * kOutputSize * sizeof(float))); + CUDA_CHECK(cudaMalloc((void **) output_seg_buffer_device, kBatchSize * kOutputSegSize * sizeof(float))); + + if (cuda_post_process == "c") { + *output_buffer_host = new float[kBatchSize * kOutputSize]; + *output_seg_buffer_host = new float[kBatchSize * kOutputSegSize]; + } else if (cuda_post_process == "g") { + if (kBatchSize > 1) { + std::cerr << "Do not yet support GPU post processing for multiple batches" << std::endl; + exit(0); + } + // Allocate memory for decode_ptr_host and copy to device + *decode_ptr_host = new float[1 + kMaxNumOutputBbox * bbox_element]; + CUDA_CHECK(cudaMalloc((void **) decode_ptr_device, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element))); + } +} + +void infer(IExecutionContext &context, cudaStream_t &stream, void **buffers, float *output, float *output_seg, + int batchsize, float *decode_ptr_host, float *decode_ptr_device, int model_bboxes, + std::string cuda_post_process) { + // infer on the batch asynchronously, and DMA output back to host + auto start = std::chrono::system_clock::now(); + context.setInputTensorAddress(kInputTensorName, buffers[0]); + context.setOutputTensorAddress(kOutputTensorName, buffers[1]); + context.setOutputTensorAddress(kProtoTensorName, buffers[2]); + context.enqueueV3(stream); + if (cuda_post_process == "c") { + + std::cout << "kOutputSize:" << kOutputSize << std::endl; + CUDA_CHECK(cudaMemcpyAsync(output, buffers[1], batchsize * kOutputSize * sizeof(float), cudaMemcpyDeviceToHost, + stream)); + std::cout << "kOutputSegSize:" << kOutputSegSize << std::endl; + CUDA_CHECK(cudaMemcpyAsync(output_seg, buffers[2], batchsize * kOutputSegSize * sizeof(float), + 
cudaMemcpyDeviceToHost, stream)); + + auto end = std::chrono::system_clock::now(); + std::cout << "inference time: " << std::chrono::duration_cast(end - start).count() + << "ms" << std::endl; + } else if (cuda_post_process == "g") { + CUDA_CHECK( + cudaMemsetAsync(decode_ptr_device, 0, sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), stream)); + cuda_decode((float *) buffers[1], model_bboxes, kConfThresh, decode_ptr_device, kMaxNumOutputBbox, stream); + cuda_nms(decode_ptr_device, kNmsThresh, kMaxNumOutputBbox, stream); //cuda nms + CUDA_CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, + sizeof(float) * (1 + kMaxNumOutputBbox * bbox_element), cudaMemcpyDeviceToHost, + stream)); + auto end = std::chrono::system_clock::now(); + std::cout << "inference and gpu postprocess time: " + << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + } + + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +bool parse_args(int argc, char **argv, std::string &wts, std::string &engine, std::string &img_dir, + std::string &sub_type, std::string &cuda_post_process, std::string &labels_filename, float &gd, + float &gw, int &max_channels) { + if (argc < 4) + return false; + if (std::string(argv[1]) == "-s" && argc == 5) { + wts = std::string(argv[2]); + engine = std::string(argv[3]); + sub_type = std::string(argv[4]); + if (sub_type == "n") { + gd = 0.33; + gw = 0.25; + max_channels = 1024; + } else if (sub_type == "s") { + gd = 0.33; + gw = 0.50; + max_channels = 1024; + } else if (sub_type == "m") { + gd = 0.67; + gw = 0.75; + max_channels = 576; + } else if (sub_type == "l") { + gd = 1.0; + gw = 1.0; + max_channels = 512; + } else if (sub_type == "x") { + gd = 1.0; + gw = 1.25; + max_channels = 640; + } else { + return false; + } + } else if (std::string(argv[1]) == "-d" && argc == 6) { + engine = std::string(argv[2]); + img_dir = std::string(argv[3]); + cuda_post_process = std::string(argv[4]); + labels_filename = std::string(argv[5]); + } else { + return false; + } + return true; +} + +int main(int argc, char **argv) { + // -s ../models/yolov8n-seg.wts ../models/yolov8n-seg.fp32.trt n + // -d ../models/yolov8n-seg.fp32.trt ../images c coco.txt + cudaSetDevice(kGpuId); + std::string wts_name = ""; + std::string engine_name = ""; + std::string img_dir; + std::string sub_type = ""; + std::string cuda_post_process = ""; + std::string labels_filename = "../coco.txt"; + int model_bboxes; + float gd = 0.0f, gw = 0.0f; + int max_channels = 0; + + if (!parse_args(argc, argv, wts_name, engine_name, img_dir, sub_type, cuda_post_process, labels_filename, gd, gw, + max_channels)) { + std::cerr << "Arguments not right!" 
<< std::endl; + std::cerr << "./yolov8 -s [.wts] [.engine] [n/s/m/l/x] // serialize model to plan file" << std::endl; + std::cerr << "./yolov8 -d [.engine] ../samples [c/g] coco_file// deserialize plan file and run inference" + << std::endl; + return -1; + } + + // Create a model using the API directly and serialize it to a file + if (!wts_name.empty()) { + serialize_engine(wts_name, engine_name, sub_type, gd, gw, max_channels); + return 0; + } + + // Deserialize the engine from file + IRuntime *runtime = nullptr; + ICudaEngine *engine = nullptr; + IExecutionContext *context = nullptr; + deserialize_engine(engine_name, &runtime, &engine, &context); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + cuda_preprocess_init(kMaxInputImageSize); + auto out_dims = engine->getTensorShape(kOutputTensorName); + model_bboxes = out_dims.d[1]; + // Prepare cpu and gpu buffers + float *device_buffers[3]; + float *output_buffer_host = nullptr; + float *output_seg_buffer_host = nullptr; + float *decode_ptr_host = nullptr; + float *decode_ptr_device = nullptr; + + // Read images from directory + std::vector file_names; + if (read_files_in_dir(img_dir.c_str(), file_names) < 0) { + std::cerr << "read_files_in_dir failed." << std::endl; + return -1; + } + + std::unordered_map labels_map; + read_labels(labels_filename, labels_map); + assert(kNumClass == labels_map.size()); + + prepare_buffer(engine, &device_buffers[0], &device_buffers[1], &device_buffers[2], &output_buffer_host, + &output_seg_buffer_host, &decode_ptr_host, &decode_ptr_device, cuda_post_process); + + // // batch predict + for (size_t i = 0; i < file_names.size(); i += kBatchSize) { + // Get a batch of images + std::vector img_batch; + std::vector img_name_batch; + for (size_t j = i; j < i + kBatchSize && j < file_names.size(); j++) { + cv::Mat img = cv::imread(img_dir + "/" + file_names[j]); + img_batch.push_back(img); + img_name_batch.push_back(file_names[j]); + } + // Preprocess + cuda_batch_preprocess(img_batch, device_buffers[0], kInputW, kInputH, stream); + // Run inference + infer(*context, stream, (void **) device_buffers, output_buffer_host, output_seg_buffer_host, kBatchSize, + decode_ptr_host, decode_ptr_device, model_bboxes, cuda_post_process); + std::vector > res_batch; + if (cuda_post_process == "c") { + // NMS + batch_nms(res_batch, output_buffer_host, img_batch.size(), kOutputSize, kConfThresh, kNmsThresh); + for (size_t b = 0; b < img_batch.size(); b++) { + auto &res = res_batch[b]; + cv::Mat img = img_batch[b]; + auto masks = process_mask(&output_seg_buffer_host[b * kOutputSegSize], kOutputSegSize, res); + draw_mask_bbox(img, res, masks, labels_map); + cv::imwrite("_" + img_name_batch[b], img); + } + } else if (cuda_post_process == "g") { + // Process gpu decode and nms results + // batch_process(res_batch, decode_ptr_host, img_batch.size(), bbox_element, img_batch); + // todo seg in gpu + std::cerr << "seg_postprocess is not support in gpu right now" << std::endl; + } + + // print results + for (size_t j = 0; j < res_batch.size(); j++) { + for (size_t k = 0; k < res_batch[j].size(); k++) { + std::cout << "image: " << img_name_batch[j] << ", bbox: " << res_batch[j][k].bbox[0] << ", " + << res_batch[j][k].bbox[1] << ", " << res_batch[j][k].bbox[2] << ", " + << res_batch[j][k].bbox[3] << ", conf: " << res_batch[j][k].conf << ", class_id: " + << res_batch[j][k].class_id << std::endl; + } + } + } + + // Release stream and buffers + cudaStreamDestroy(stream); + CUDA_CHECK(cudaFree(device_buffers[0])); + 
CUDA_CHECK(cudaFree(device_buffers[1])); + CUDA_CHECK(cudaFree(device_buffers[2])); + CUDA_CHECK(cudaFree(decode_ptr_device)); + delete[] decode_ptr_host; + delete[] output_buffer_host; + delete[] output_seg_buffer_host; + cuda_preprocess_destroy(); + // Destroy the engine + delete context; + delete engine; + delete runtime; + + // Print histogram of the output distribution + // std::cout << "\nOutput:\n\n"; + // for (unsigned int i = 0; i < kOutputSize; i++) + //{ + // std::cout << prob[i] << ", "; + // if (i % 10 == 0) std::cout << std::endl; + //} + // std::cout << std::endl; + + return 0; +} diff --git a/yolov8_seg_trt.py b/yolov8_seg_trt.py new file mode 100644 index 0000000..e3ed9d6 --- /dev/null +++ b/yolov8_seg_trt.py @@ -0,0 +1,590 @@ +""" +An example that uses TensorRT's Python api to make inferences. +""" +import ctypes +import os +import shutil +import random +import sys +import threading +import time +import cv2 +import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt + +CONF_THRESH = 0.5 +IOU_THRESHOLD = 0.4 +POSE_NUM = 17 * 3 +DET_NUM = 6 +SEG_NUM = 32 + + +def get_img_path_batches(batch_size, img_dir): + ret = [] + batch = [] + for root, dirs, files in os.walk(img_dir): + for name in files: + if len(batch) == batch_size: + ret.append(batch) + batch = [] + batch.append(os.path.join(root, name)) + if len(batch) > 0: + ret.append(batch) + return ret + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + """ + description: Plots one bounding box on image img, + this function comes from YoLov8 project. + param: + x: a box likes [x1,y1,x2,y2] + img: a opencv image object + color: color to draw rectangle, such as (0,255,0) + label: str + line_thickness: int + return: + no return + + """ + tl = ( + line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 + ) # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, + (c1[0], c1[1] - 2), + 0, + tl / 3, + [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA, + ) + + +class YoLov8TRT(object): + """ + description: A YOLOv8 class that warps TensorRT ops, preprocess and postprocess ops. + """ + + def __init__(self, engine_file_path): + # Create a Context on this device, + self.ctx = cuda.Device(0).make_context() + stream = cuda.Stream() + TRT_LOGGER = trt.Logger(trt.Logger.INFO) + runtime = trt.Runtime(TRT_LOGGER) + + # Deserialize the engine from file + with open(engine_file_path, "rb") as f: + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + host_inputs = [] + cuda_inputs = [] + host_outputs = [] + cuda_outputs = [] + input_binding_names = [] + output_binding_names = [] + + for binding_name in engine: + shape = engine.get_tensor_shape(binding_name) + print('binding_name:', binding_name, shape) + size = trt.volume(shape) + dtype = trt.nptype(engine.get_tensor_dtype(binding_name)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + cuda_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. 
+ # Append to the appropriate list. + if engine.get_tensor_mode(binding_name) == trt.TensorIOMode.INPUT: + input_binding_names.append(binding_name) + self.input_w = shape[-1] + self.input_h = shape[-2] + host_inputs.append(host_mem) + cuda_inputs.append(cuda_mem) + elif engine.get_tensor_mode(binding_name) == trt.TensorIOMode.OUTPUT: + output_binding_names.append(binding_name) + host_outputs.append(host_mem) + cuda_outputs.append(cuda_mem) + else: + print('unknow:', binding_name) + + # Store + self.stream = stream + self.context = context + self.engine = engine + self.host_inputs = host_inputs + self.cuda_inputs = cuda_inputs + self.host_outputs = host_outputs + self.cuda_outputs = cuda_outputs + self.input_binding_names = input_binding_names + self.output_binding_names = output_binding_names + self.batch_size = engine.get_tensor_shape(input_binding_names[0])[0] + print('batch_size:', self.batch_size) + + # Data length + self.det_output_length = host_outputs[0].shape[0] + self.seg_output_length = host_outputs[1].shape[0] + self.seg_w = int(self.input_w / 4) + self.seg_h = int(self.input_h / 4) + self.seg_c = int(self.seg_output_length / (self.seg_w * self.seg_w)) + self.det_row_output_length = self.seg_c + DET_NUM + POSE_NUM + + # Draw mask + self.colors_obj = Colors() + + def infer(self, raw_image_generator): + threading.Thread.__init__(self) + # Make self the active context, pushing it on top of the context stack. + self.ctx.push() + # Restore + stream = self.stream + context = self.context + host_inputs = self.host_inputs + cuda_inputs = self.cuda_inputs + host_outputs = self.host_outputs + cuda_outputs = self.cuda_outputs + input_binding_names = self.input_binding_names + output_binding_names = self.output_binding_names + # Do image preprocess + batch_image_raw = [] + batch_origin_h = [] + batch_origin_w = [] + batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w]) + for i, image_raw in enumerate(raw_image_generator): + input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw) + batch_image_raw.append(image_raw) + batch_origin_h.append(origin_h) + batch_origin_w.append(origin_w) + np.copyto(batch_input_image[i], input_image) + batch_input_image = np.ascontiguousarray(batch_input_image) + + # Copy input image to host buffer + np.copyto(host_inputs[0], batch_input_image.ravel()) + start = time.time() + # Transfer input data to the GPU. + cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream) + # Run inference. + context.set_tensor_address(input_binding_names[0], cuda_inputs[0]) + context.set_tensor_address(output_binding_names[0], cuda_outputs[0]) + context.set_tensor_address(output_binding_names[1], cuda_outputs[1]) + context.execute_async_v3(stream_handle=stream.handle) + # Transfer predictions back from the GPU. + cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream) + cuda.memcpy_dtoh_async(host_outputs[1], cuda_outputs[1], stream) + + # Synchronize the stream + stream.synchronize() + end = time.time() + # Remove any context from the top of the context stack, deactivating it. 
+ self.ctx.pop() + # Here we use the first row of output in that batch_size = 1 + output = host_outputs[0] + output_proto_mask = host_outputs[1] + # Do postprocess + for i in range(self.batch_size): + result_boxes, result_scores, result_classid, result_proto_coef = self.post_process( + output[i * self.det_output_length: (i + 1) * self.det_output_length], batch_origin_h[i], + batch_origin_w[i] + ) + + if result_proto_coef.shape[0] == 0: + continue + result_masks = self.process_mask(output_proto_mask, result_proto_coef, result_boxes, batch_origin_h[i], + batch_origin_w[i]) + + self.draw_mask(result_masks, colors_=[self.colors_obj(x, True) for x in result_classid], + im_src=batch_image_raw[i]) + + # Draw rectangles and labels on the original image + for j in range(len(result_boxes)): + box = result_boxes[j] + plot_one_box( + box, + batch_image_raw[i], + label="{}:{:.2f}".format( + categories[int(result_classid[j])], result_scores[j] + ), + ) + return batch_image_raw, end - start + + def destroy(self): + # Remove any context from the top of the context stack, deactivating it. + self.ctx.pop() + + def get_raw_image(self, image_path_batch): + """ + description: Read an image from image path + """ + for img_path in image_path_batch: + yield cv2.imread(img_path) + + def get_raw_image_zeros(self, image_path_batch=None): + """ + description: Ready data for warmup + """ + for _ in range(self.batch_size): + yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8) + + def preprocess_image(self, raw_bgr_image): + """ + description: Convert BGR image to RGB, + resize and pad it to target size, normalize to [0,1], + transform to NCHW format. + param: + input_image_path: str, image path + return: + image: the processed image + image_raw: the original image + h: original height + w: original width + """ + image_raw = raw_bgr_image + h, w, c = image_raw.shape + image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB) + # Calculate widht and height and paddings + r_w = self.input_w / w + r_h = self.input_h / h + if r_h > r_w: + tw = self.input_w + th = int(r_w * h) + tx1 = tx2 = 0 + ty1 = int((self.input_h - th) / 2) + ty2 = self.input_h - th - ty1 + else: + tw = int(r_h * w) + th = self.input_h + tx1 = int((self.input_w - tw) / 2) + tx2 = self.input_w - tw - tx1 + ty1 = ty2 = 0 + # Resize the image with long side while maintaining ratio + image = cv2.resize(image, (tw, th)) + # Pad the short side with (128,128,128) + image = cv2.copyMakeBorder( + image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128) + ) + image = image.astype(np.float32) + # Normalize to [0,1] + image /= 255.0 + # HWC to CHW format: + image = np.transpose(image, [2, 0, 1]) + # CHW to NCHW format + image = np.expand_dims(image, axis=0) + # Convert the image to row-major order, also known as "C order": + image = np.ascontiguousarray(image) + return image, image_raw, h, w + + def xywh2xyxy(self, origin_h, origin_w, x): + """ + description: Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + param: + origin_h: height of original image + origin_w: width of original image + x: A boxes numpy, each row is a box [center_x, center_y, w, h] + return: + y: A boxes numpy, each row is a box [x1, y1, x2, y2] + """ + y = np.zeros_like(x) + r_w = self.input_w / origin_w + r_h = self.input_h / origin_h + if r_h > r_w: + y[:, 0] = x[:, 0] + y[:, 2] = x[:, 2] + y[:, 1] = x[:, 1] - (self.input_h - r_w * origin_h) / 2 + y[:, 3] = x[:, 3] - (self.input_h - r_w * origin_h) / 2 + y /= r_w + else: + y[:, 0] 
= x[:, 0] - (self.input_w - r_h * origin_w) / 2 + y[:, 2] = x[:, 2] - (self.input_w - r_h * origin_w) / 2 + y[:, 1] = x[:, 1] + y[:, 3] = x[:, 3] + y /= r_h + + return y + + def post_process(self, output, origin_h, origin_w): + """ + description: postprocess the prediction + param: + output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] + origin_h: height of original image + origin_w: width of original image + return: + result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2] + result_scores: finally scores, a numpy, each element is the score correspoing to box + result_classid: finally classid, a numpy, each element is the classid correspoing to box + """ + # Get the num of boxes detected + num = int(output[0]) + # Reshape to a two dimentional ndarray + pred = np.reshape(output[1:], (-1, self.det_row_output_length))[:num, :] + + # Do nms + boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD) + result_boxes = boxes[:, :4] if len(boxes) else np.array([]) + result_scores = boxes[:, 4] if len(boxes) else np.array([]) + result_classid = boxes[:, 5] if len(boxes) else np.array([]) + result_proto_coef = boxes[:, DET_NUM:int(DET_NUM + SEG_NUM)] if len(boxes) else np.array([]) + return result_boxes, result_scores, result_classid, result_proto_coef + + def bbox_iou(self, box1, box2, x1y1x2y2=True): + """ + description: compute the IoU of two bounding boxes + param: + box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h)) + x1y1x2y2: select the coordinate format + return: + iou: computed iou + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # Get the coordinates of the intersection rectangle + inter_rect_x1 = np.maximum(b1_x1, b2_x1) + inter_rect_y1 = np.maximum(b1_y1, b2_y1) + inter_rect_x2 = np.minimum(b1_x2, b2_x2) + inter_rect_y2 = np.minimum(b1_y2, b2_y2) + # Intersection area + inter_area = (np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) + * np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4): + """ + description: Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ param: + prediction: detections, (x1, y1, x2, y2, conf, cls_id) + origin_h: original image height + origin_w: original image width + conf_thres: a confidence threshold to filter detections + nms_thres: a iou threshold to filter detections + return: + boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id) + """ + # Get the boxes that score > CONF_THRESH + boxes = prediction[prediction[:, 4] >= conf_thres] + # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2] + boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4]) + # clip the coordinates + boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1) + boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1) + boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1) + boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1) + # Object confidence + confs = boxes[:, 4] + # Sort by the confs + boxes = boxes[np.argsort(-confs)] + # Perform non-maximum suppression + keep_boxes = [] + while boxes.shape[0]: + large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres + label_match = boxes[0, 5] == boxes[:, 5] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + keep_boxes += [boxes[0]] + boxes = boxes[~invalid] + boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([]) + return boxes + + def sigmoid(self, x): + return 1 / (1 + np.exp(-x)) + + def scale_mask(self, mask, ih, iw): + mask = cv2.resize(mask, (self.input_w, self.input_h)) + r_w = self.input_w / (iw * 1.0) + r_h = self.input_h / (ih * 1.0) + if r_h > r_w: + w = self.input_w + h = int(r_w * ih) + x = 0 + y = int((self.input_h - h) / 2) + else: + w = int(r_h * iw) + h = self.input_h + x = int((self.input_w - w) / 2) + y = 0 + crop = mask[y:y + h, x:x + w] + crop = cv2.resize(crop, (iw, ih)) + return crop + + def process_mask(self, output_proto_mask, result_proto_coef, result_boxes, ih, iw): + """ + description: Mask pred by yolov8 instance segmentation , + param: + output_proto_mask: prototype mask e.g. 
(32, 160, 160) for 640x640 input + result_proto_coef: prototype mask coefficients (n, 32), n represents n results + result_boxes : + ih: rows of original image + iw: cols of original image + return: + mask_result: (n, ih, iw) + """ + result_proto_masks = output_proto_mask.reshape(self.seg_c, self.seg_h, self.seg_w) + c, mh, mw = result_proto_masks.shape + print(result_proto_masks.shape) + print(result_proto_coef.shape) + masks = self.sigmoid((result_proto_coef @ result_proto_masks.astype(np.float32).reshape(c, -1))).reshape(-1, mh, + mw) + + mask_result = [] + for mask, box in zip(masks, result_boxes): + mask_s = np.zeros((ih, iw)) + crop_mask = self.scale_mask(mask, ih, iw) + x1 = int(box[0]) + y1 = int(box[1]) + x2 = int(box[2]) + y2 = int(box[3]) + crop = crop_mask[y1:y2, x1:x2] + crop = np.where(crop >= 0.5, 1, 0) + crop = crop.astype(np.uint8) + mask_s[y1:y2, x1:x2] = crop + + mask_result.append(mask_s) + mask_result = np.array(mask_result) + return mask_result + + def draw_mask(self, masks, colors_, im_src, alpha=0.5): + """ + description: Draw mask on image , + param: + masks : result_mask + colors_: color to draw mask + im_src : original image + alpha : scale between original image and mask + return: + no return + """ + if len(masks) == 0: + return + masks = np.asarray(masks, dtype=np.uint8) + masks = np.ascontiguousarray(masks.transpose(1, 2, 0)) + masks = np.asarray(masks, dtype=np.float32) + colors_ = np.asarray(colors_, dtype=np.float32) + s = masks.sum(2, keepdims=True).clip(0, 1) + masks = (masks @ colors_).clip(0, 255) + im_src[:] = masks * alpha + im_src * (1 - s * alpha) + + +class inferThread(threading.Thread): + def __init__(self, yolov8_wrapper, image_path_batch): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + self.image_path_batch = image_path_batch + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image(self.image_path_batch)) + for i, img_path in enumerate(self.image_path_batch): + parent, filename = os.path.split(img_path) + save_name = os.path.join('output', filename) + # Save image + cv2.imwrite(save_name, batch_image_raw[i]) + print('input->{}, time->{:.2f}ms, saving into output/'.format(self.image_path_batch, use_time * 1000)) + + +class warmUpThread(threading.Thread): + def __init__(self, yolov8_wrapper): + threading.Thread.__init__(self) + self.yolov8_wrapper = yolov8_wrapper + + def run(self): + batch_image_raw, use_time = self.yolov8_wrapper.infer(self.yolov8_wrapper.get_raw_image_zeros()) + print('warm_up->{}, time->{:.2f}ms'.format(batch_image_raw[0].shape, use_time * 1000)) + + +class Colors: + def __init__(self): + hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', + '92CC17', '3DDB86', '1A9334', '00D4BB', '2C99A8', '00C2FF', + '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', + 'FF95C8', 'FF37C7') + self.palette = [self.hex2rgb(f'#{c}') for c in hexs] + self.n = len(self.palette) + + def __call__(self, i, bgr=False): + c = self.palette[int(i) % self.n] + return (c[2], c[1], c[0]) if bgr else c + + @staticmethod + def hex2rgb(h): # rgb order (PIL) + return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4)) + + +if __name__ == "__main__": + # load custom plugin and engine + PLUGIN_LIBRARY = "build/libmyplugins.so" + engine_file_path = "yolov8n-seg.engine" + + if len(sys.argv) > 1: + engine_file_path = sys.argv[1] + if len(sys.argv) > 2: + PLUGIN_LIBRARY = sys.argv[2] + + ctypes.CDLL(PLUGIN_LIBRARY) + + # load coco labels + + categories = 
["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", + "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", + "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", + "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", + "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", + "teddy bear", + "hair drier", "toothbrush"] + + if os.path.exists('output/'): + shutil.rmtree('output/') + os.makedirs('output/') + # a YoLov8TRT instance + yolov8_wrapper = YoLov8TRT(engine_file_path) + try: + print('batch size is', yolov8_wrapper.batch_size) + + image_dir = "images/" + image_path_batches = get_img_path_batches(yolov8_wrapper.batch_size, image_dir) + + for i in range(10): + # create a new thread to do warm_up + thread1 = warmUpThread(yolov8_wrapper) + thread1.start() + thread1.join() + for batch in image_path_batches: + # create a new thread to do inference + thread1 = inferThread(yolov8_wrapper, batch) + thread1.start() + thread1.join() + finally: + # destroy the instance + yolov8_wrapper.destroy()