diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73503b4e..d8079fcb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -114,6 +114,7 @@ endif()
include_directories(
${CUDA_INCLUDE_DIRS}
${CUDNN_ROOT_DIR}/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/third_party/cub
)
find_library(CUDNN_LIB cudnn HINTS
${CUDA_TOOLKIT_ROOT_DIR} ${CUDNN_ROOT_DIR} PATH_SUFFIXES lib64 lib)
diff --git a/README.md b/README.md
index 78d08bb5..c6aff0b2 100644
--- a/README.md
+++ b/README.md
@@ -1,221 +1,107 @@
-[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Documentation](https://img.shields.io/badge/TensorRT-documentation-brightgreen.svg)](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html)
+# TensorRT Custom Plugins
-# TensorRT Open Source Software
-This repository contains the Open Source Software (OSS) components of NVIDIA TensorRT. Included are the sources for TensorRT plugins and parsers (Caffe and ONNX), as well as sample applications demonstrating usage and capabilities of the TensorRT platform. These open source software components are a subset of the TensorRT General Availability (GA) release with some extensions and bug-fixes.
+This fork adds several custom TensorRT plugins.
-* For code contributions to TensorRT-OSS, please see our [Contribution Guide](CONTRIBUTING.md) and [Coding Guidelines](CODING-GUIDELINES.md).
-* For a summary of new additions and updates shipped with TensorRT-OSS releases, please refer to the [Changelog](CHANGELOG.md).
-* For business inquiries, please contact [researchinquiries@nvidia.com](mailto:researchinquiries@nvidia.com)
-* For press and other inquiries, please contact Hector Marinez at [hmarinez@nvidia.com](mailto:hmarinez@nvidia.com)
+## New plugins
-Need enterprise support? NVIDIA global support is available for TensorRT with the [NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/launchpad/ai/ai-enterprise/) for free access to a set of hands-on labs with TensorRT hosted on NVIDIA infrastructure.
+- [EfficientNMSLandmark_TRT](./plugin/efficientNMSLandmarkPlugin/): Efficient NMS extended to also output face landmarks
+- [EfficientNMSCustom_TRT](./plugin/efficientNMSCustomPlugin/): Same as Efficient NMS, but also returns the indices of the selected boxes
+- [RoIAlignDynamic](./plugin/roIAlignPlugin/): Same as ONNX RoIAlign, ported from [MMCVRoIAlign](https://github.com/open-mmlab/mmdeploy)
+- [RoIAlign2Dynamic](./plugin/roIAlign2Plugin/): Same as pyramidROIAlignPlugin, but with a single feature map
-Join the [TensorRT and Triton community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best practices, and more.
+## Prerequisites
-# Build
+- DeepStream 6.0.1 or DeepStream 6.1
-## Prerequisites
-To build the TensorRT-OSS components, you will first need the following software packages.
-
-**TensorRT GA build**
-* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.4.3.1
-
-**System Packages**
-* [CUDA](https://developer.nvidia.com/cuda-toolkit)
- * Recommended versions:
- * cuda-11.6.x + cuDNN-8.4
- * cuda-10.2 + cuDNN-8.4
-* [GNU make](https://ftp.gnu.org/gnu/make/) >= v4.1
-* [cmake](https://github.com/Kitware/CMake/releases) >= v3.13
-* [python]() >= v3.6.9
-* [pip](https://pypi.org/project/pip/#history) >= v19.0
-* Essential utilities
- * [git](https://git-scm.com/downloads), [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/), [wget](https://www.gnu.org/software/wget/faq.html#download)
-
-**Optional Packages**
-* Containerized build
- * [Docker](https://docs.docker.com/install/) >= 19.03
- * [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker)
-* Toolchains and SDKs
- * (Cross compilation for Jetson platform) [NVIDIA JetPack](https://developer.nvidia.com/embedded/jetpack) >= 5.0 (current support only for TensorRT 8.4.0)
- * (For Windows builds) [Visual Studio](https://visualstudio.microsoft.com/vs/older-downloads/) 2017 Community or Enterprise edition
- * (Cross compilation for QNX platform) [QNX Toolchain](https://blackberry.qnx.com/en)
-* PyPI packages (for demo applications/tests)
- * [onnx](https://pypi.org/project/onnx/) 1.9.0
- * [onnxruntime](https://pypi.org/project/onnxruntime/) 1.8.0
- * [tensorflow-gpu](https://pypi.org/project/tensorflow/) >= 2.5.1
- * [Pillow](https://pypi.org/project/Pillow/) >= 9.0.1
- * [pycuda](https://pypi.org/project/pycuda/) < 2021.1
- * [numpy](https://pypi.org/project/numpy/)
- * [pytest](https://pypi.org/project/pytest/)
-* Code formatting tools (for contributors)
- * [Clang-format](https://clang.llvm.org/docs/ClangFormat.html)
- * [Git-clang-format](https://github.com/llvm-mirror/clang/blob/master/tools/clang-format/git-clang-format)
-
- > NOTE: [onnx-tensorrt](https://github.com/onnx/onnx-tensorrt), [cub](http://nvlabs.github.io/cub/), and [protobuf](https://github.com/protocolbuffers/protobuf.git) packages are downloaded along with TensorRT OSS, and not required to be installed.
-
-## Downloading TensorRT Build
-
-1. #### Download TensorRT OSS
- ```bash
- git clone -b master https://github.com/nvidia/TensorRT TensorRT
- cd TensorRT
- git submodule update --init --recursive
- ```
-
-2. #### (Optional - if not using TensorRT container) Specify the TensorRT GA release build
-
- If using the TensorRT OSS build container, TensorRT libraries are preinstalled under `/usr/lib/x86_64-linux-gnu` and you may skip this step.
-
- Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download).
-
- **Example: Ubuntu 20.04 on x86-64 with cuda-11.6.2**
-
- ```bash
- cd ~/Downloads
- tar -xvzf TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz
- export TRT_LIBPATH=`pwd`/TensorRT-8.4.3.1
- ```
-
- **Example: Windows on x86-64 with cuda-11.4**
-
- ```powershell
- cd ~\Downloads
- Expand-Archive .\TensorRT-8.4.3.1.Windows10.x86_64.cuda-11.6.cudnn8.4.zip
- $Env:TRT_LIBPATH = '$(Get-Location)\TensorRT-8.4.3.1'
- $Env:PATH += 'C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\'
- ```
-
-
-3. #### (Optional - for Jetson builds only) Download the JetPack SDK
- 1. Download and launch the JetPack SDK manager. Login with your NVIDIA developer account.
- 2. Select the platform and target OS (example: Jetson AGX Xavier, `Linux Jetpack 5.0`), and click Continue.
- 3. Under `Download & Install Options` change the download folder and select `Download now, Install later`. Agree to the license terms and click Continue.
- 4. Move the extracted files into the `/docker/jetpack_files` folder.
-
-
-## Setting Up The Build Environment
-
-For Linux platforms, we recommend that you generate a docker container for building TensorRT OSS as described below. For native builds, on Windows for example, please install the [prerequisite](#prerequisites) *System Packages*.
-
-1. #### Generate the TensorRT-OSS build container.
- The TensorRT-OSS build container can be generated using the supplied Dockerfiles and build script. The build container is configured for building TensorRT OSS out-of-the-box.
-
- **Example: Ubuntu 20.04 on x86-64 with cuda-11.6.2 (default)**
- ```bash
- ./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda11.6
- ```
- **Example: CentOS/RedHat 7 on x86-64 with cuda-10.2**
- ```bash
- ./docker/build.sh --file docker/centos-7.Dockerfile --tag tensorrt-centos7-cuda10.2 --cuda 10.2
- ```
- **Example: Ubuntu 20.04 cross-compile for Jetson (aarch64) with cuda-11.4.2 (JetPack SDK)**
- ```bash
- ./docker/build.sh --file docker/ubuntu-cross-aarch64.Dockerfile --tag tensorrt-jetpack-cuda11.4
- ```
- **Example: Ubuntu 20.04 on aarch64 with cuda-11.4.2**
- ```bash
- ./docker/build.sh --file docker/ubuntu-20.04-aarch64.Dockerfile --tag tensorrt-aarch64-ubuntu20.04-cuda11.4
- ```
-
-2. #### Launch the TensorRT-OSS build container.
- **Example: Ubuntu 20.04 build container**
- ```bash
- ./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda11.6 --gpus all
- ```
- > NOTE:
- 1. Use the `--tag` corresponding to build container generated in Step 1.
- 2. [NVIDIA Container Toolkit](#prerequisites) is required for GPU access (running TensorRT applications) inside the build container.
- 3. `sudo` password for Ubuntu build containers is 'nvidia'.
- 4. Specify port number using `--jupyter ` for launching Jupyter notebooks.
-
-## Building TensorRT-OSS
-* Generate Makefiles or VS project (Windows) and build.
-
- **Example: Linux (x86-64) build with default cuda-11.6.2**
- ```bash
- cd $TRT_OSSPATH
- mkdir -p build && cd build
- cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out
- make -j$(nproc)
- ```
-
- > NOTE: On CentOS7, the default g++ version does not support C++14. For native builds (not using the CentOS7 build container), first install devtoolset-8 to obtain the updated g++ toolchain as follows:
- ```bash
- yum -y install centos-release-scl
- yum-config-manager --enable rhel-server-rhscl-7-rpms
- yum -y install devtoolset-8
- export PATH="/opt/rh/devtoolset-8/root/bin:${PATH}
- ```
-
- **Example: Linux (aarch64) build with default cuda-11.6.2**
- ```bash
- cd $TRT_OSSPATH
- mkdir -p build && cd build
- cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64-native.toolchain
- make -j$(nproc)
- ```
-
- **Example: Native build on Jetson (aarch64) with cuda-11.4**
- ```bash
- cd $TRT_OSSPATH
- mkdir -p build && cd build
- cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DTRT_PLATFORM_ID=aarch64 -DCUDA_VERSION=11.4
- CC=/usr/bin/gcc make -j$(nproc)
- ```
- > NOTE: C compiler must be explicitly specified via `CC=` for native `aarch64` builds of protobuf.
-
- **Example: Ubuntu 20.04 Cross-Compile for Jetson (aarch64) with cuda-11.4 (JetPack)**
- ```bash
- cd $TRT_OSSPATH
- mkdir -p build && cd build
- cmake .. -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64.toolchain -DCUDA_VERSION=11.4 -DCUDNN_LIB=/pdk_files/cudnn/usr/lib/aarch64-linux-gnu/libcudnn.so -DCUBLAS_LIB=/usr/local/cuda-11.4/targets/aarch64-linux/lib/stubs/libcublas.so -DCUBLASLT_LIB=/usr/local/cuda-11.4/targets/aarch64-linux/lib/stubs/libcublasLt.so
-
- make -j$(nproc)
- ```
- > NOTE: The latest JetPack SDK v5.0 only supports TensorRT 8.4.0.
-
- **Example: Windows (x86-64) build in Powershell**
- ```powershell
- cd $Env:TRT_OSSPATH
- mkdir -p build ; cd build
- cmake .. -DTRT_LIB_DIR=$Env:TRT_LIBPATH -DTRT_OUT_DIR='$(Get-Location)\out' -DCMAKE_TOOLCHAIN_FILE=..\cmake\toolchains\cmake_x64_win.toolchain
- msbuild ALL_BUILD.vcxproj
- ```
- > NOTE:
- 1. The default CUDA version used by CMake is 11.6.2. To override this, for example to 10.2, append `-DCUDA_VERSION=10.2` to the cmake command.
- 2. If samples fail to link on CentOS7, create this symbolic link: `ln -s $TRT_OUT_DIR/libnvinfer_plugin.so $TRT_OUT_DIR/libnvinfer_plugin.so.8`
-* Required CMake build arguments are:
- - `TRT_LIB_DIR`: Path to the TensorRT installation directory containing libraries.
- - `TRT_OUT_DIR`: Output directory where generated build artifacts will be copied.
-* Optional CMake build arguments:
- - `CMAKE_BUILD_TYPE`: Specify if binaries generated are for release or debug (contain debug symbols). Values consists of [`Release`] | `Debug`
- - `CUDA_VERISON`: The version of CUDA to target, for example [`11.6.2`].
- - `CUDNN_VERSION`: The version of cuDNN to target, for example [`8.4`].
- - `PROTOBUF_VERSION`: The version of Protobuf to use, for example [`3.0.0`]. Note: Changing this will not configure CMake to use a system version of Protobuf, it will configure CMake to download and try building that version.
- - `CMAKE_TOOLCHAIN_FILE`: The path to a toolchain file for cross compilation.
- - `BUILD_PARSERS`: Specify if the parsers should be built, for example [`ON`] | `OFF`. If turned OFF, CMake will try to find precompiled versions of the parser libraries to use in compiling samples. First in `${TRT_LIB_DIR}`, then on the system. If the build type is Debug, then it will prefer debug builds of the libraries before release versions if available.
- - `BUILD_PLUGINS`: Specify if the plugins should be built, for example [`ON`] | `OFF`. If turned OFF, CMake will try to find a precompiled version of the plugin library to use in compiling samples. First in `${TRT_LIB_DIR}`, then on the system. If the build type is Debug, then it will prefer debug builds of the libraries before release versions if available.
- - `BUILD_SAMPLES`: Specify if the samples should be built, for example [`ON`] | `OFF`.
- - `GPU_ARCHS`: GPU (SM) architectures to target. By default we generate CUDA code for all major SMs. Specific SM versions can be specified here as a quoted space-separated list to reduce compilation time and binary size. Table of compute capabilities of NVIDIA GPUs can be found [here](https://developer.nvidia.com/cuda-gpus). Examples:
- - NVidia A100: `-DGPU_ARCHS="80"`
- - Tesla T4, GeForce RTX 2080: `-DGPU_ARCHS="75"`
- - Titan V, Tesla V100: `-DGPU_ARCHS="70"`
- - Multiple SMs: `-DGPU_ARCHS="80 75"`
- - `TRT_PLATFORM_ID`: Bare-metal build (unlike containerized cross-compilation) on non Linux/x86 platforms must explicitly specify the target platform. Currently supported options: `x86_64` (default), `aarch64`
-
-# References
-
-## TensorRT Resources
-
-* [TensorRT Developer Home](https://developer.nvidia.com/tensorrt)
-* [TensorRT QuickStart Guide](https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html)
-* [TensorRT Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html)
-* [TensorRT Sample Support Guide](https://docs.nvidia.com/deeplearning/tensorrt/sample-support-guide/index.html)
-* [TensorRT ONNX Tools](https://docs.nvidia.com/deeplearning/tensorrt/index.html#tools)
-* [TensorRT Discussion Forums](https://devtalk.nvidia.com/default/board/304/tensorrt/)
-* [TensorRT Release Notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
-
-## Known Issues
-
-* Please refer to [TensorRT 8.4 Release Notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#tensorrt-8)
+## Install
+
+Please refer to the build guide at [github.com/NVIDIA-AI-IOT/deepstream_tao_apps](https://github.com/NVIDIA-AI-IOT/deepstream_tao_apps/blob/master/TRT-OSS/x86/README.md)
+
+### 1. Install CMake (>= 3.13)
+
+TensorRT OSS requires CMake >= v3.13, so install CMake 3.13 or newer if your current version is older:
+
+```
+wget https://github.com/Kitware/CMake/releases/download/v3.19.4/cmake-3.19.4.tar.gz
+tar xvf cmake-3.19.4.tar.gz
+cd cmake-3.19.4/
+mkdir $HOME/install
+./configure --prefix=$HOME/install
+make -j$(nproc)
+sudo make install
+```
+
+### 2. Build TensorRT OSS Plugin
+
+| DeepStream Release | TRT Version | TRT_OSS_CHECKOUT_TAG | Supported |
+| ------------------ | ----------- | -------------------- | ------- |
+| 5.0 | TRT 7.0.0 | release/7.0 | No |
+| 5.0.1 | TRT 7.0.0 | release/7.0 | No |
+| 5.1 | TRT 7.2.X | 21.03 | No |
+| 6.0 EA | TRT 7.2.2 | 21.03 | No |
+| 6.0 GA | TRT 8.0.1 | release/8.0 | No |
+| 6.0.1 | TRT 8.2.1 | release/8.2 | Yes |
+| 6.1 | TRT 8.2.5.1 | release/8.2 | Yes |
+
+```
+git clone -b release/8.2 https://github.com/hiennguyen9874/TensorRT
+cd TensorRT/
+git submodule update --init --recursive
+export TRT_SOURCE=`pwd`
+cd $TRT_SOURCE
+mkdir -p build && cd build
+# NOTE: make sure your GPU_ARCHS is listed in the TRT OSS CMakeLists.txt.
+# If it is not, add -DGPU_ARCHS=xy as below; for the value of xy, see the "How to Get GPU_ARCHS" section below.
+$HOME/install/bin/cmake .. -DGPU_ARCHS=xy -DTRT_LIB_DIR=/usr/lib/x86_64-linux-gnu/ -DCMAKE_C_COMPILER=/usr/bin/gcc -DTRT_BIN_DIR=`pwd`/out
+make nvinfer_plugin -j$(nproc)
+```
+
+After the build completes successfully, libnvinfer_plugin.so\* will be generated under `pwd`/out/ (i.e. ./build/out/).
+
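+Optionally, you can sanity-check that the rebuilt library exposes the new plugin creators. The sketch below is illustrative only (the logger class, file name and build command are assumptions); the plugin names and version `"1"` are taken from the table in `plugin/README.md`:
+
+```
+// check_plugins.cpp -- illustrative sketch, not part of this repository.
+// Assumed build command: g++ check_plugins.cpp -o check_plugins -lnvinfer -lnvinfer_plugin
+#include <NvInfer.h>
+#include <NvInferPlugin.h>
+#include <cstdio>
+
+class Logger : public nvinfer1::ILogger
+{
+    void log(Severity severity, const char* msg) noexcept override
+    {
+        if (severity <= Severity::kWARNING)
+            std::printf("%s\n", msg);
+    }
+};
+
+int main()
+{
+    Logger logger;
+    // Registers every plugin compiled into libnvinfer_plugin.so, including the custom ones.
+    initLibNvInferPlugins(&logger, "");
+    const char* names[] = {"EfficientNMSLandmark_TRT", "EfficientNMSCustom_TRT"};
+    for (const char* name : names)
+    {
+        auto* creator = getPluginRegistry()->getPluginCreator(name, "1");
+        std::printf("%-26s %s\n", name, creator ? "registered" : "NOT FOUND");
+    }
+    // The RoIAlign plugins can be checked the same way, using the names listed in plugin/README.md.
+    return 0;
+}
+```
+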
+### 3. Replace "libnvinfer_plugin.so\*"
+
+```
+# backup the original libnvinfer_plugin.so.x.y, e.g. libnvinfer_plugin.so.8.2.1
+sudo mv /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.8.p.q ${HOME}/libnvinfer_plugin.so.8.p.q.bak
+# only replace the real file, don't touch the symlinks, e.g. libnvinfer_plugin.so, libnvinfer_plugin.so.8
+sudo cp $TRT_SOURCE/build/out/libnvinfer_plugin.so.8.m.n /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.8.p.q
+sudo ldconfig
+```
+
+## How to Get GPU_ARCHS
+
+You can use either of the following methods to get GPU_ARCHS:
+
+1. The GPU_ARCHS value can be obtained with the "deviceQuery" CUDA sample:
+
+```
+cd /usr/local/cuda/samples/1_Utilities/deviceQuery
+sudo make
+./deviceQuery
+```
+
+2. If "/usr/local/cuda/samples" does not exist on your system, you can use the deviceQuery.cpp in this folder:
+
+```
+nvcc deviceQuery.cpp -o deviceQuery
+./deviceQuery
+```
+
+The output will look like the example below, which indicates that "GPU_ARCHS" is **75** (CUDA capability 7.5).
+
+```
+./deviceQuery
+
+Detected 2 CUDA Capable device(s)
+
+Device 0: "Tesla T4"
+ CUDA Driver Version / Runtime Version 10.2 / 10.2
+ CUDA Capability Major/Minor version number: 7.5
+```
+
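+Alternatively (a minimal sketch, not part of this repository), the compute capability can be read directly through the CUDA runtime API; GPU_ARCHS is then `major * 10 + minor`:
+
+```
+// get_gpu_archs.cpp -- illustrative sketch; assumed build command: nvcc get_gpu_archs.cpp -o get_gpu_archs
+#include <cuda_runtime_api.h>
+#include <cstdio>
+
+int main()
+{
+    int count = 0;
+    cudaGetDeviceCount(&count);
+    for (int i = 0; i < count; ++i)
+    {
+        cudaDeviceProp prop{};
+        cudaGetDeviceProperties(&prop, i);
+        // e.g. major = 7, minor = 5  ->  GPU_ARCHS="75"
+        std::printf("Device %d: %s -> GPU_ARCHS=%d%d\n", i, prop.name, prop.major, prop.minor);
+    }
+    return 0;
+}
+```
+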
+# Acknowledgments
+
+- [NNDam/TensorRT-CPP](https://github.com/NNDam/TensorRT-CPP)
+- [MMCVRoIAlign](https://github.com/open-mmlab/mmdeploy)
diff --git a/README_ORIGIN.md b/README_ORIGIN.md
new file mode 100644
index 00000000..78d08bb5
--- /dev/null
+++ b/README_ORIGIN.md
@@ -0,0 +1,221 @@
+[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Documentation](https://img.shields.io/badge/TensorRT-documentation-brightgreen.svg)](https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html)
+
+# TensorRT Open Source Software
+This repository contains the Open Source Software (OSS) components of NVIDIA TensorRT. Included are the sources for TensorRT plugins and parsers (Caffe and ONNX), as well as sample applications demonstrating usage and capabilities of the TensorRT platform. These open source software components are a subset of the TensorRT General Availability (GA) release with some extensions and bug-fixes.
+
+* For code contributions to TensorRT-OSS, please see our [Contribution Guide](CONTRIBUTING.md) and [Coding Guidelines](CODING-GUIDELINES.md).
+* For a summary of new additions and updates shipped with TensorRT-OSS releases, please refer to the [Changelog](CHANGELOG.md).
+* For business inquiries, please contact [researchinquiries@nvidia.com](mailto:researchinquiries@nvidia.com)
+* For press and other inquiries, please contact Hector Marinez at [hmarinez@nvidia.com](mailto:hmarinez@nvidia.com)
+
+Need enterprise support? NVIDIA global support is available for TensorRT with the [NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/). Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/launchpad/ai/ai-enterprise/) for free access to a set of hands-on labs with TensorRT hosted on NVIDIA infrastructure.
+
+Join the [TensorRT and Triton community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and stay current on the latest product updates, bug fixes, content, best practices, and more.
+
+# Build
+
+## Prerequisites
+To build the TensorRT-OSS components, you will first need the following software packages.
+
+**TensorRT GA build**
+* [TensorRT](https://developer.nvidia.com/nvidia-tensorrt-download) v8.4.3.1
+
+**System Packages**
+* [CUDA](https://developer.nvidia.com/cuda-toolkit)
+ * Recommended versions:
+ * cuda-11.6.x + cuDNN-8.4
+ * cuda-10.2 + cuDNN-8.4
+* [GNU make](https://ftp.gnu.org/gnu/make/) >= v4.1
+* [cmake](https://github.com/Kitware/CMake/releases) >= v3.13
+* [python]() >= v3.6.9
+* [pip](https://pypi.org/project/pip/#history) >= v19.0
+* Essential utilities
+ * [git](https://git-scm.com/downloads), [pkg-config](https://www.freedesktop.org/wiki/Software/pkg-config/), [wget](https://www.gnu.org/software/wget/faq.html#download)
+
+**Optional Packages**
+* Containerized build
+ * [Docker](https://docs.docker.com/install/) >= 19.03
+ * [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker)
+* Toolchains and SDKs
+ * (Cross compilation for Jetson platform) [NVIDIA JetPack](https://developer.nvidia.com/embedded/jetpack) >= 5.0 (current support only for TensorRT 8.4.0)
+ * (For Windows builds) [Visual Studio](https://visualstudio.microsoft.com/vs/older-downloads/) 2017 Community or Enterprise edition
+ * (Cross compilation for QNX platform) [QNX Toolchain](https://blackberry.qnx.com/en)
+* PyPI packages (for demo applications/tests)
+ * [onnx](https://pypi.org/project/onnx/) 1.9.0
+ * [onnxruntime](https://pypi.org/project/onnxruntime/) 1.8.0
+ * [tensorflow-gpu](https://pypi.org/project/tensorflow/) >= 2.5.1
+ * [Pillow](https://pypi.org/project/Pillow/) >= 9.0.1
+ * [pycuda](https://pypi.org/project/pycuda/) < 2021.1
+ * [numpy](https://pypi.org/project/numpy/)
+ * [pytest](https://pypi.org/project/pytest/)
+* Code formatting tools (for contributors)
+ * [Clang-format](https://clang.llvm.org/docs/ClangFormat.html)
+ * [Git-clang-format](https://github.com/llvm-mirror/clang/blob/master/tools/clang-format/git-clang-format)
+
+ > NOTE: [onnx-tensorrt](https://github.com/onnx/onnx-tensorrt), [cub](http://nvlabs.github.io/cub/), and [protobuf](https://github.com/protocolbuffers/protobuf.git) packages are downloaded along with TensorRT OSS, and not required to be installed.
+
+## Downloading TensorRT Build
+
+1. #### Download TensorRT OSS
+ ```bash
+ git clone -b master https://github.com/nvidia/TensorRT TensorRT
+ cd TensorRT
+ git submodule update --init --recursive
+ ```
+
+2. #### (Optional - if not using TensorRT container) Specify the TensorRT GA release build
+
+ If using the TensorRT OSS build container, TensorRT libraries are preinstalled under `/usr/lib/x86_64-linux-gnu` and you may skip this step.
+
+ Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-tensorrt-download).
+
+ **Example: Ubuntu 20.04 on x86-64 with cuda-11.6.2**
+
+ ```bash
+ cd ~/Downloads
+ tar -xvzf TensorRT-8.4.3.1.Linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz
+ export TRT_LIBPATH=`pwd`/TensorRT-8.4.3.1
+ ```
+
+ **Example: Windows on x86-64 with cuda-11.4**
+
+ ```powershell
+ cd ~\Downloads
+ Expand-Archive .\TensorRT-8.4.3.1.Windows10.x86_64.cuda-11.6.cudnn8.4.zip
+ $Env:TRT_LIBPATH = '$(Get-Location)\TensorRT-8.4.3.1'
+ $Env:PATH += 'C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\'
+ ```
+
+
+3. #### (Optional - for Jetson builds only) Download the JetPack SDK
+ 1. Download and launch the JetPack SDK manager. Login with your NVIDIA developer account.
+ 2. Select the platform and target OS (example: Jetson AGX Xavier, `Linux Jetpack 5.0`), and click Continue.
+ 3. Under `Download & Install Options` change the download folder and select `Download now, Install later`. Agree to the license terms and click Continue.
+ 4. Move the extracted files into the `/docker/jetpack_files` folder.
+
+
+## Setting Up The Build Environment
+
+For Linux platforms, we recommend that you generate a docker container for building TensorRT OSS as described below. For native builds, on Windows for example, please install the [prerequisite](#prerequisites) *System Packages*.
+
+1. #### Generate the TensorRT-OSS build container.
+ The TensorRT-OSS build container can be generated using the supplied Dockerfiles and build script. The build container is configured for building TensorRT OSS out-of-the-box.
+
+ **Example: Ubuntu 20.04 on x86-64 with cuda-11.6.2 (default)**
+ ```bash
+ ./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda11.6
+ ```
+ **Example: CentOS/RedHat 7 on x86-64 with cuda-10.2**
+ ```bash
+ ./docker/build.sh --file docker/centos-7.Dockerfile --tag tensorrt-centos7-cuda10.2 --cuda 10.2
+ ```
+ **Example: Ubuntu 20.04 cross-compile for Jetson (aarch64) with cuda-11.4.2 (JetPack SDK)**
+ ```bash
+ ./docker/build.sh --file docker/ubuntu-cross-aarch64.Dockerfile --tag tensorrt-jetpack-cuda11.4
+ ```
+ **Example: Ubuntu 20.04 on aarch64 with cuda-11.4.2**
+ ```bash
+ ./docker/build.sh --file docker/ubuntu-20.04-aarch64.Dockerfile --tag tensorrt-aarch64-ubuntu20.04-cuda11.4
+ ```
+
+2. #### Launch the TensorRT-OSS build container.
+ **Example: Ubuntu 20.04 build container**
+ ```bash
+ ./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda11.6 --gpus all
+ ```
+ > NOTE:
+ 1. Use the `--tag` corresponding to build container generated in Step 1.
+ 2. [NVIDIA Container Toolkit](#prerequisites) is required for GPU access (running TensorRT applications) inside the build container.
+ 3. `sudo` password for Ubuntu build containers is 'nvidia'.
+ 4. Specify port number using `--jupyter ` for launching Jupyter notebooks.
+
+## Building TensorRT-OSS
+* Generate Makefiles or VS project (Windows) and build.
+
+ **Example: Linux (x86-64) build with default cuda-11.6.2**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out
+ make -j$(nproc)
+ ```
+
+ > NOTE: On CentOS7, the default g++ version does not support C++14. For native builds (not using the CentOS7 build container), first install devtoolset-8 to obtain the updated g++ toolchain as follows:
+ ```bash
+ yum -y install centos-release-scl
+ yum-config-manager --enable rhel-server-rhscl-7-rpms
+ yum -y install devtoolset-8
+ export PATH="/opt/rh/devtoolset-8/root/bin:${PATH}
+ ```
+
+ **Example: Linux (aarch64) build with default cuda-11.6.2**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64-native.toolchain
+ make -j$(nproc)
+ ```
+
+ **Example: Native build on Jetson (aarch64) with cuda-11.4**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DTRT_LIB_DIR=$TRT_LIBPATH -DTRT_OUT_DIR=`pwd`/out -DTRT_PLATFORM_ID=aarch64 -DCUDA_VERSION=11.4
+ CC=/usr/bin/gcc make -j$(nproc)
+ ```
+ > NOTE: C compiler must be explicitly specified via `CC=` for native `aarch64` builds of protobuf.
+
+ **Example: Ubuntu 20.04 Cross-Compile for Jetson (aarch64) with cuda-11.4 (JetPack)**
+ ```bash
+ cd $TRT_OSSPATH
+ mkdir -p build && cd build
+ cmake .. -DCMAKE_TOOLCHAIN_FILE=$TRT_OSSPATH/cmake/toolchains/cmake_aarch64.toolchain -DCUDA_VERSION=11.4 -DCUDNN_LIB=/pdk_files/cudnn/usr/lib/aarch64-linux-gnu/libcudnn.so -DCUBLAS_LIB=/usr/local/cuda-11.4/targets/aarch64-linux/lib/stubs/libcublas.so -DCUBLASLT_LIB=/usr/local/cuda-11.4/targets/aarch64-linux/lib/stubs/libcublasLt.so
+
+ make -j$(nproc)
+ ```
+ > NOTE: The latest JetPack SDK v5.0 only supports TensorRT 8.4.0.
+
+ **Example: Windows (x86-64) build in Powershell**
+ ```powershell
+ cd $Env:TRT_OSSPATH
+ mkdir -p build ; cd build
+ cmake .. -DTRT_LIB_DIR=$Env:TRT_LIBPATH -DTRT_OUT_DIR='$(Get-Location)\out' -DCMAKE_TOOLCHAIN_FILE=..\cmake\toolchains\cmake_x64_win.toolchain
+ msbuild ALL_BUILD.vcxproj
+ ```
+ > NOTE:
+ 1. The default CUDA version used by CMake is 11.6.2. To override this, for example to 10.2, append `-DCUDA_VERSION=10.2` to the cmake command.
+ 2. If samples fail to link on CentOS7, create this symbolic link: `ln -s $TRT_OUT_DIR/libnvinfer_plugin.so $TRT_OUT_DIR/libnvinfer_plugin.so.8`
+* Required CMake build arguments are:
+ - `TRT_LIB_DIR`: Path to the TensorRT installation directory containing libraries.
+ - `TRT_OUT_DIR`: Output directory where generated build artifacts will be copied.
+* Optional CMake build arguments:
+ - `CMAKE_BUILD_TYPE`: Specify if binaries generated are for release or debug (contain debug symbols). Values consists of [`Release`] | `Debug`
+ - `CUDA_VERISON`: The version of CUDA to target, for example [`11.6.2`].
+ - `CUDNN_VERSION`: The version of cuDNN to target, for example [`8.4`].
+ - `PROTOBUF_VERSION`: The version of Protobuf to use, for example [`3.0.0`]. Note: Changing this will not configure CMake to use a system version of Protobuf, it will configure CMake to download and try building that version.
+ - `CMAKE_TOOLCHAIN_FILE`: The path to a toolchain file for cross compilation.
+ - `BUILD_PARSERS`: Specify if the parsers should be built, for example [`ON`] | `OFF`. If turned OFF, CMake will try to find precompiled versions of the parser libraries to use in compiling samples. First in `${TRT_LIB_DIR}`, then on the system. If the build type is Debug, then it will prefer debug builds of the libraries before release versions if available.
+ - `BUILD_PLUGINS`: Specify if the plugins should be built, for example [`ON`] | `OFF`. If turned OFF, CMake will try to find a precompiled version of the plugin library to use in compiling samples. First in `${TRT_LIB_DIR}`, then on the system. If the build type is Debug, then it will prefer debug builds of the libraries before release versions if available.
+ - `BUILD_SAMPLES`: Specify if the samples should be built, for example [`ON`] | `OFF`.
+ - `GPU_ARCHS`: GPU (SM) architectures to target. By default we generate CUDA code for all major SMs. Specific SM versions can be specified here as a quoted space-separated list to reduce compilation time and binary size. Table of compute capabilities of NVIDIA GPUs can be found [here](https://developer.nvidia.com/cuda-gpus). Examples:
+ - NVidia A100: `-DGPU_ARCHS="80"`
+ - Tesla T4, GeForce RTX 2080: `-DGPU_ARCHS="75"`
+ - Titan V, Tesla V100: `-DGPU_ARCHS="70"`
+ - Multiple SMs: `-DGPU_ARCHS="80 75"`
+ - `TRT_PLATFORM_ID`: Bare-metal build (unlike containerized cross-compilation) on non Linux/x86 platforms must explicitly specify the target platform. Currently supported options: `x86_64` (default), `aarch64`
+
+# References
+
+## TensorRT Resources
+
+* [TensorRT Developer Home](https://developer.nvidia.com/tensorrt)
+* [TensorRT QuickStart Guide](https://docs.nvidia.com/deeplearning/tensorrt/quick-start-guide/index.html)
+* [TensorRT Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html)
+* [TensorRT Sample Support Guide](https://docs.nvidia.com/deeplearning/tensorrt/sample-support-guide/index.html)
+* [TensorRT ONNX Tools](https://docs.nvidia.com/deeplearning/tensorrt/index.html#tools)
+* [TensorRT Discussion Forums](https://devtalk.nvidia.com/default/board/304/tensorrt/)
+* [TensorRT Release Notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
+
+## Known Issues
+
+* Please refer to [TensorRT 8.4 Release Notes](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/tensorrt-8.html#tensorrt-8)
diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt
index 77117276..680c29fd 100644
--- a/plugin/CMakeLists.txt
+++ b/plugin/CMakeLists.txt
@@ -40,6 +40,8 @@ set(PLUGIN_LISTS
disentangledAttentionPlugin
efficientNMSPlugin
efficientNMSPlugin/tftrt
+ efficientNMSCustomPlugin
+ efficientNMSLandmarkPlugin
flattenConcat
generateDetectionPlugin
gridAnchorPlugin
@@ -64,6 +66,8 @@ set(PLUGIN_LISTS
specialSlicePlugin
splitPlugin
voxelGeneratorPlugin
+ roIAlignPlugin
+ roIAlign2Plugin
)
# Add BERT sources if ${BERT_GENCODES} was populated
diff --git a/plugin/README.md b/plugin/README.md
index acbc417b..76daef75 100644
--- a/plugin/README.md
+++ b/plugin/README.md
@@ -44,7 +44,10 @@
| [specialSlicePlugin](specialSlicePlugin) | SpecialSlice_TRT | 1 |
| [splitPlugin](splitPlugin) | Split | 1 |
| [voxelGeneratorPlugin](voxelGeneratorPlugin) | VoxelGeneratorPlugin | 1 |
-
+| [efficientNMSLandmarkPlugin](efficientNMSLandmarkPlugin) | EfficientNMSLandmark_TRT | 1 |
+| [efficientNMSCustomPlugin](efficientNMSCustomPlugin) | EfficientNMSCustom_TRT | 1 |
+| [roIAlignPlugin](roIAlignPlugin) | RoIAlignDynamic_TRT | 1 |
+| [roIAlign2Plugin](roIAlign2Plugin) | RoIAlign2Dynamic_TRT | 1 |
+
## Known Limitations
- None
diff --git a/plugin/api/InferPlugin.cpp b/plugin/api/InferPlugin.cpp
index 0f7f7cb2..7074ca8c 100644
--- a/plugin/api/InferPlugin.cpp
+++ b/plugin/api/InferPlugin.cpp
@@ -37,6 +37,8 @@ using namespace nvinfer1::plugin;
#include "efficientNMSPlugin.h"
#include "tftrt/efficientNMSImplicitTFTRTPlugin.h"
#include "tftrt/efficientNMSExplicitTFTRTPlugin.h"
+#include "efficientNMSLandmarkPlugin.h"
+#include "efficientNMSCustomPlugin.h"
#include "flattenConcat.h"
#include "generateDetectionPlugin.h"
#include "gridAnchorPlugin.h"
@@ -60,6 +62,8 @@ using namespace nvinfer1::plugin;
#include "specialSlicePlugin.h"
#include "split.h"
#include "voxelGenerator.h"
+#include "roIAlignPlugin.h"
+#include "roIAlign2Plugin.h"
using nvinfer1::plugin::RPROIParams;
@@ -181,6 +185,8 @@ extern "C"
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
+ initializePlugin(logger, libNamespace);
+ initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
@@ -207,6 +213,8 @@ extern "C"
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
initializePlugin(logger, libNamespace);
+ initializePlugin(logger, libNamespace);
+ initializePlugin(logger, libNamespace);
return true;
}
} // extern "C"
diff --git a/plugin/common/kernels/maskRCNNKernels.cu b/plugin/common/kernels/maskRCNNKernels.cu
index d03be3d1..0022f512 100644
--- a/plugin/common/kernels/maskRCNNKernels.cu
+++ b/plugin/common/kernels/maskRCNNKernels.cu
@@ -61,6 +61,18 @@ inline __device__ __half add_fb(const __half & a, const half & b) {
#endif
}
+inline __device__ __half div_fb(const __half & a, const __half & b) {
+#if __CUDA_ARCH__ >= 530
+ return a / b;
+#else
+ return __float2half(__half2float(a) / __half2float(b));
+#endif
+}
+
+inline __device__ float div_fb(const float & a, const float & b) {
+ return a / b;
+}
+
template
__global__ void argMaxReset_kernel(
int samples, int NClass, const DType* in_scores, const int* maxIdx, DType* out_scores)
@@ -2644,6 +2656,144 @@ cudaError_t roiAlign(cudaStream_t const stream, int32_t const batchSize, xy_t co
return cudaGetLastError();
}
+template <typename Trois, typename Tfeat>
+__global__ void roiAlign_kernel(int const imageSize, int const featureCount, int const roiCount,
+ int const transformCoords, bool const absCoords, bool const swapCoords,
+ int const samplingRatio, Trois const* rois, Tfeat const* P2, xy_t const P2dims,
+ Tfeat* pooled, int const poolDims)
+{
+ int const batch = blockIdx.x;
+ int const feature = blockIdx.y;
+ int const roiIdx = blockIdx.z;
+
+ Trois const* roi = rois + 4 * (batch * roiCount + roiIdx);
+ float y1, x1, y2, x2;
+ if (swapCoords)
+ {
+ y1 = min(roi[0], roi[2]);
+ x1 = min(roi[1], roi[3]);
+ y2 = max(roi[0], roi[2]);
+ x2 = max(roi[1], roi[3]);
+ }
+ else
+ {
+ x1 = min(roi[0], roi[2]);
+ y1 = min(roi[1], roi[3]);
+ x2 = max(roi[0], roi[2]);
+ y2 = max(roi[1], roi[3]);
+ }
+ if (absCoords)
+ {
+        y1 = max(0.F, min(static_cast<float>(imageSize), y1)) / imageSize;
+        x1 = max(0.F, min(static_cast<float>(imageSize), x1)) / imageSize;
+        y2 = max(0.F, min(static_cast<float>(imageSize), y2)) / imageSize;
+        x2 = max(0.F, min(static_cast<float>(imageSize), x2)) / imageSize;
+ }
+ else
+ {
+ y1 = max(0.F, min(1.F, y1));
+ x1 = max(0.F, min(1.F, x1));
+ y2 = max(0.F, min(1.F, y2));
+ x2 = max(0.F, min(1.F, x2));
+ }
+
+ Tfeat const* src = P2;
+ xy_t srcDims = P2dims;
+
+ src += srcDims.x * srcDims.y * (batch * featureCount + feature);
+
+ Tfeat* dst = pooled + poolDims * poolDims * (batch * roiCount * featureCount + roiIdx * featureCount + feature);
+
+ float yStart, xStart, yEnd, xEnd, yDelta, xDelta;
+ float samplingOffset;
+ if (transformCoords == -1)
+ {
+ // Back-Compatibility with old PyramidROIAlign implementation.
+ samplingOffset = 0.F;
+
+ yStart = y1 * (srcDims.y - 1);
+ xStart = x1 * (srcDims.x - 1);
+
+ yEnd = y2 * (srcDims.y - 1);
+ xEnd = x2 * (srcDims.x - 1);
+
+ yDelta = (yEnd - yStart) / (poolDims - 1);
+ xDelta = (xEnd - xStart) / (poolDims - 1);
+ }
+ else
+ {
+ float inputOffset;
+ if (transformCoords == 0) // No Half Pixel
+ {
+ inputOffset = 0.F;
+ samplingOffset = 0.F;
+ }
+ if (transformCoords == 1) // Output Half Pixel
+ {
+ inputOffset = 0.F;
+ samplingOffset = 0.5F;
+ }
+ if (transformCoords == 2) // Half Pixel
+ {
+ inputOffset = 0.5F;
+ samplingOffset = 0.5F;
+ }
+
+ yStart = y1 * srcDims.y - inputOffset;
+ xStart = x1 * srcDims.x - inputOffset;
+
+ yEnd = y2 * srcDims.y - inputOffset;
+ xEnd = x2 * srcDims.x - inputOffset;
+
+ yDelta = (yEnd - yStart) / poolDims;
+ xDelta = (xEnd - xStart) / poolDims;
+ }
+
+    int const samplingRatioX
+        = samplingRatio > 0 ? samplingRatio : max(1, static_cast<int>(ceilf((xEnd - xStart) / poolDims)));
+    int const samplingRatioY
+        = samplingRatio > 0 ? samplingRatio : max(1, static_cast<int>(ceilf((yEnd - yStart) / poolDims)));
+ int const samplingCount = samplingRatioX * samplingRatioY;
+
+ for (int outIdx = threadIdx.x; outIdx < poolDims * poolDims; outIdx += blockDim.x)
+ {
+ int xx = outIdx % poolDims;
+ int yy = outIdx / poolDims;
+ Tfeat* out = dst + poolDims * yy + xx;
+ Tfeat result = 0;
+ for (int iy = 0; iy < samplingRatioY; iy++)
+ {
+ float ySample = yStart + yDelta * yy;
+ ySample += yDelta * (iy + samplingOffset) / samplingRatioY;
+ ySample = min(max(ySample, 0.F), srcDims.y - 1.F);
+
+ for (int ix = 0; ix < samplingRatioX; ix++)
+ {
+ float xSample = xStart + xDelta * xx;
+ xSample += xDelta * (ix + samplingOffset) / samplingRatioX;
+ xSample = min(max(xSample, 0.F), srcDims.x - 1.F);
+
+ result += interpolateBilinear(src, srcDims, ySample, xSample);
+ }
+ }
+ *out = result / samplingCount;
+ }
+}
+
+cudaError_t roiAlign(cudaStream_t const stream, int const batchSize, int const imageSize,
+ int const featureCount, int const roiCount, int const transformCoords,
+ bool const absCoords, bool const swapCoords, int const samplingRatio,
+ void const* rois, void const* layers, xy_t const layerDims, void* pooled, int const poolDims)
+{
+ const dim3 blocks(batchSize, featureCount, roiCount);
+ int const threads(min(256, poolDims * poolDims));
+
+    roiAlign_kernel<float, float><<<blocks, threads, 0, stream>>>(imageSize, featureCount, roiCount, transformCoords,
+        absCoords, swapCoords, samplingRatio, static_cast<float const*>(rois), static_cast<float const*>(layers),
+        layerDims, static_cast<float*>(pooled), poolDims);
+
+ return cudaGetLastError();
+}
+
template
__global__ void roiAlignHalfCenter_kernel(int featureCount, int roiCount,
diff --git a/plugin/common/kernels/maskRCNNKernels.h b/plugin/common/kernels/maskRCNNKernels.h
index 23ea4487..6e536916 100644
--- a/plugin/common/kernels/maskRCNNKernels.h
+++ b/plugin/common/kernels/maskRCNNKernels.h
@@ -275,6 +275,11 @@ cudaError_t roiAlign(cudaStream_t const stream, int32_t const batchSize, xy_t co
bool const absCoords, bool const swapCoords, bool const plusOneCoords, int32_t const samplingRatio,
void const* rois, void const* const layers[], xy_t const* layerDims, void* pooled, xy_t const poolDims);
+cudaError_t roiAlign(cudaStream_t const stream, int const batchSize, int const imageSize,
+ int const featureCount, int const roiCount, int const transformCoords,
+ bool const absCoords, bool const swapCoords, int const samplingRatio,
+ void const* rois, void const* layers, xy_t const layerDims, void* pooled, int const poolDims);
+
cudaError_t roiAlignHalfCenter(cudaStream_t stream, int batchSize, int featureCount, int roiCount, float firstThreshold,
int inputHeight, int inputWidth, const void* rois, const void* const layers[], const xy_t* layerDims,
diff --git a/plugin/efficientNMSCustomPlugin/CMakeLists.txt b/plugin/efficientNMSCustomPlugin/CMakeLists.txt
new file mode 100644
index 00000000..53b70a7e
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/CMakeLists.txt
@@ -0,0 +1,21 @@
+#
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
+file(GLOB CU_SRCS *.cu)
+set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
+set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE)
diff --git a/plugin/efficientNMSCustomPlugin/README.md b/plugin/efficientNMSCustomPlugin/README.md
new file mode 100644
index 00000000..1728fc06
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/README.md
@@ -0,0 +1,162 @@
+# Efficient NMS Custom Plugin
+
+#### Table of Contents
+- [Description](#description)
+- [Structure](#structure)
+ * [Inputs](#inputs)
+ * [Dynamic Shape Support](#dynamic-shape-support)
+ * [Box Coding Type](#box-coding-type)
+ * [Outputs](#outputs)
+ * [Parameters](#parameters)
+- [Algorithm](#algorithm)
+ * [Process Description](#process-description)
+ * [Performance Tuning](#performance-tuning)
+ * [Additional Resources](#additional-resources)
+- [License](#license)
+
+## Description
+
+This TensorRT plugin implements an efficient algorithm to perform Non Maximum Suppression for object detection networks. Compared to the standard Efficient NMS plugin, it additionally returns the indices of the boxes selected by NMS.
+
+This plugin is primarily intended for use with EfficientDet on TensorRT, as this network is particularly sensitive to the latencies introduced by slower NMS implementations. However, the plugin is generic enough that it will work correctly for other detection architectures, such as SSD or FasterRCNN.
+
+## Structure
+
+### Inputs
+
+The plugin has two modes of operation, depending on the given input data. The plugin will automatically detect which mode to operate in, depending on the number of inputs it receives, as follows:
+
+1. **Standard NMS Mode:** Only two input tensors are given, (i) the bounding box coordinates and (ii) the corresponding classification scores for each box.
+
+2. **Fused Box Decoder Mode:** Three input tensors are given, (i) the raw localization predictions for each box originating directly from the localization head of the network, (ii) the corresponding classification scores originating from the classification head of the network, and (iii) the default anchor box coordinates usually hardcoded as constant tensors in the network.
+
+Most object detection networks work by generating raw predictions from a "localization head" which adjust the coordinates of standard non-learned anchor coordinates to produce a tighter fitting bounding box. This process is called "box decoding", and it usually involves a large number of element-wise operations to transform the anchors to final box coordinates. As this can involve exponential operations on a large number of anchors, it can be computationally expensive, so this plugin gives the option of fusing the box decoder within the NMS operation which can be done in a far more efficient manner, resulting in lower latency for the network.
+
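+To make the decoding step concrete, the snippet below shows the typical center-size box decode used by anchor-based detectors (a simplified illustration of the idea, not the plugin's exact kernel code):
+
+```
+// Typical center-size box decode (simplified, host-side illustration).
+#include <cmath>
+
+struct BoxCS { float x, y, w, h; };        // center x/y, width, height
+struct BoxXY { float x1, y1, x2, y2; };    // corner coordinates
+
+BoxXY decodeBox(const BoxCS& pred, const BoxCS& anchor)
+{
+    // The localization head predicts offsets relative to the anchor;
+    // sizes are predicted in log-space, hence the exponentials.
+    float cx = pred.x * anchor.w + anchor.x;
+    float cy = pred.y * anchor.h + anchor.y;
+    float w = std::exp(pred.w) * anchor.w;
+    float h = std::exp(pred.h) * anchor.h;
+    return {cx - 0.5F * w, cy - 0.5F * h, cx + 0.5F * w, cy + 0.5F * h};
+}
+```
+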
+#### Boxes Input
+> **Input Shape:** `[batch_size, number_boxes, 4]` or `[batch_size, number_boxes, number_classes, 4]`
+>
+> **Data Type:** `float32` or `float16`
+
+The boxes input can have 3 dimensions in case a single box prediction is produced for all classes (such as in EfficientDet or SSD), or 4 dimensions when separate box predictions are generated for each class (such as in FasterRCNN), in which case `number_classes` >= 1 and must match the number of classes in the scores input. The final dimension represents the four coordinates that define the bounding box prediction.
+
+For *Standard NMS* mode, this tensor should contain the final box coordinates for each predicted detection. For *Fused Box Decoder* mode, this tensor should have the raw localization predictions. In either case, this data is given as `4` coordinates which makes up the final shape dimension.
+
+#### Scores Input
+> **Input Shape:** `[batch_size, number_boxes, number_classes]`
+>
+> **Data Type:** `float32` or `float16`
+
+The scores input has `number_classes` elements with the predicted scores for each candidate class for each of the `number_boxes` anchor boxes.
+
+Usually, the score values will have passed through a sigmoid activation function before reaching the NMS operation. However, as an optimization, the pre-sigmoid raw scores can also be provided to the NMS plugin to reduce overall network latency. If raw scores are given, enable the `score_activation` parameter so they are processed accordingly.
+
+#### Anchors Input (Optional)
+> **Input Shape:** `[1, number_boxes, 4]` or `[batch_size, number_boxes, 4]`
+>
+> **Data Type:** `float32` or `float16`
+
+Only used in *Fused Box Decoder* mode. It is much more efficient to perform the box decoding within this plugin. In this case, the boxes input will be treated as the raw localization head box corrections, and this third input should contain the default anchor/prior box coordinates.
+
+When used, the input must have 3 dimensions, where the first one may be either `1` in case anchors are constant for all images in a batch, or `batch_size` in case each image has different anchors -- such as in the box refinement NMS of FasterRCNN's second stage.
+
+### Dynamic Shape Support
+
+Most input shape dimensions, namely `batch_size`, `number_boxes`, and `number_classes`, for all inputs can be defined dynamically at runtime if the TensorRT engine is built with dynamic input shapes. However, once defined, these dimensions must match across all tensors that use them (e.g. the same `number_boxes` dimension must be given for both boxes and scores, etc.)
+
+### Box Coding Type
+Different object detection networks represent their box coordinate system differently. The two types supported by this plugin are:
+
+1. **BoxCorners:** The four coordinates represent `[x1, y1, x2, y2]` values, where each x,y pair defines the top-left and bottom-right corners of a bounding box.
+2. **BoxCenterSize:** The four coordinates represent `[x, y, w, h]` values, where the x,y pair define the box center location, and the w,h pair define its width and height.
+
+Note that for NMS purposes, horizontal and vertical coordinates are fully interchangeable. TensorFlow-trained networks, for example, often use vertical-first coordinates such as `[y1, x1, y2, x2]`, but this coordinate system will work equally well under the BoxCorner coding. Similarly, `[y, x, h, w]` will be properly covered by the BoxCenterSize coding.
+
+In *Fused Box Decoder* mode, the boxes and anchor tensors should both use the same coding.
+
+### Outputs
+
+The following five output tensors are generated:
+
+- **num_detections:**
+  This is a `[batch_size, 1]` tensor of data type `int32`. The last dimension is a scalar indicating the number of valid detections per batch image. It can be less than `max_output_boxes`. Only the top `num_detections[i]` entries in `detection_boxes[i]`, `detection_scores[i]`, `detection_classes[i]` and `detection_indices[i]` are valid.
+
+- **detection_boxes:**
+ This is a `[batch_size, max_output_boxes, 4]` tensor of data type `float32` or `float16`, containing the coordinates of non-max suppressed boxes. The output coordinates will always be in BoxCorner format, regardless of the input code type.
+
+- **detection_scores:**
+ This is a `[batch_size, max_output_boxes]` tensor of data type `float32` or `float16`, containing the scores for the boxes.
+
+- **detection_classes:**
+ This is a `[batch_size, max_output_boxes]` tensor of data type `int32`, containing the classes for the boxes.
+
+- **detection_indices:**
+  This is a `[batch_size, max_output_boxes]` tensor of data type `int32`, containing, for each kept detection, the index of the corresponding box in the boxes input, so that additional per-box data can be gathered from the original network outputs.
+
+### Parameters
+
+| Type | Parameter | Description
+|----------|--------------------------|--------------------------------------------------------
+|`float` |`score_threshold` * |The scalar threshold for score (low scoring boxes are removed).
+|`float` |`iou_threshold` |The scalar threshold for IOU (additional boxes that have high IOU overlap with previously selected boxes are removed).
+|`int` |`max_output_boxes` |The maximum number of detections to output per image.
+|`int` |`background_class` |The label ID for the background class. If there is no background class, set it to `-1`.
+|`bool` |`score_activation` * |Set to true to apply sigmoid activation to the confidence scores during NMS operation.
+|`int` |`box_coding` |Coding type used for boxes (and anchors if applicable), 0 = BoxCorner, 1 = BoxCenterSize.
+
+Parameters marked with a `*` have a non-negligible effect on runtime latency. See the [Performance Tuning](#performance-tuning) section below for more details on how to set them optimally.
+
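+For reference, the sketch below shows one way to create this plugin through the TensorRT plugin registry. The field names mirror the parameter table above; the concrete values, the function name and the surrounding wiring are placeholders, and `initLibNvInferPlugins()` is assumed to have been called already:
+
+```
+// create_efficient_nms_custom.cpp -- illustrative sketch only.
+#include <NvInfer.h>
+#include <vector>
+
+nvinfer1::IPluginV2* createEfficientNMSCustom()
+{
+    using nvinfer1::PluginField;
+    using nvinfer1::PluginFieldType;
+
+    // Placeholder values; tune them for the target network.
+    float scoreThreshold = 0.25F;
+    float iouThreshold = 0.45F;
+    int maxOutputBoxes = 100;
+    int backgroundClass = -1; // no background class
+    int scoreActivation = 0;  // scores are already sigmoid-activated
+    int boxCoding = 0;        // 0 = BoxCorner, 1 = BoxCenterSize
+
+    std::vector<PluginField> fields{
+        {"score_threshold", &scoreThreshold, PluginFieldType::kFLOAT32, 1},
+        {"iou_threshold", &iouThreshold, PluginFieldType::kFLOAT32, 1},
+        {"max_output_boxes", &maxOutputBoxes, PluginFieldType::kINT32, 1},
+        {"background_class", &backgroundClass, PluginFieldType::kINT32, 1},
+        {"score_activation", &scoreActivation, PluginFieldType::kINT32, 1},
+        {"box_coding", &boxCoding, PluginFieldType::kINT32, 1}};
+    nvinfer1::PluginFieldCollection fc{static_cast<int>(fields.size()), fields.data()};
+
+    // Assumes initLibNvInferPlugins() (or a manual creator registration) has already run.
+    auto* creator = getPluginRegistry()->getPluginCreator("EfficientNMSCustom_TRT", "1");
+    return creator ? creator->createPlugin("efficient_nms_custom", &fc) : nullptr;
+}
+```
+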
+## Algorithm
+
+### Process Description
+
+The NMS algorithm in this plugin first filters the scores below the given `scoreThreshold`. This subset of scores is then sorted, and their corresponding boxes are then further filtered out by removing boxes that overlap each other with an IOU above the given `iouThreshold`.
+
+The algorithm launcher and its relevant CUDA kernels are all defined in the `efficientNMSCustomInference.cu` file.
+
+Specifically, the NMS algorithm does the following:
+
+- The scores are filtered with the `score_threshold` parameter to reject any scores below the score threshold, while maintaining indexing to cross-reference these scores to their corresponding box coordinates. This is done with the `EfficientNMSCustomFilter` CUDA kernel.
+
+- If too many elements are kept, due to a very low (or zero) score threshold, the filter operation can become a bottleneck due to the atomic operations involved. To mitigate this, a fallback kernel `EfficientNMSCustomDenseIndex` is used instead which passes all the score elements densely packed and indexed. This method is heuristically selected only if the score threshold is less than 0.007.
+
+- The selected scores that remain after filtering are sorted in descending order. The indexing is carefully handled to still maintain score to box relationships after sorting.
+
+- After sorting, the highest 4096 scores are processed by the `EfficientNMSCustom` CUDA kernel. This algorithm uses the index data maintained throughout the previous steps to find the boxes corresponding to the remaining scores. If the fused box decoder is being used, decoding is deferred until this stage, where only the top scoring boxes need to be decoded.
+
+- The NMS kernel uses an efficient filtering algorithm that largely reduces the number of IOU overlap cross-checks between box pairs. The boxes that survive the IOU filtering finally pass through to the output results. At this stage, the sigmoid activation is applied to only the final remaining scores, if `score_activation` is enabled, thereby greatly reducing the amount of sigmoid calculations required otherwise.
+
+### Performance Tuning
+
+The plugin implements a very efficient NMS algorithm which largely reduces the latency of this operation in comparison to other NMS plugins. However, there are certain considerations that can help to better fine tune its performance:
+
+#### Choosing the Score Threshold
+
+The algorithm is highly sensitive to the selected `score_threshold` parameter. With a higher threshold, fewer elements need to be processed and so the algorithm runs much faster. Therefore, it's beneficial to always select the highest possible score threshold that fulfills the application requirements. Threshold values lower than approximately 0.01 may cause substantially higher latency.
+
+#### Using Sigmoid Activation
+
+Depending on network configuration, it is usually more efficient to provide raw scores (pre-sigmoid) to the NMS plugin scores input, and enable the `score_activation` parameter. Doing so applies a sigmoid activation only to the last `max_output_boxes` selected scores, instead of all the predicted scores, largely reducing the computational cost.
+
+#### Using the Fused Box Decoder
+
+When using networks with many anchors, such as EfficientDet or SSD, it may be more efficient to do box decoding within the NMS plugin. For this, pass the raw box predictions as the boxes input, and the default anchor coordinates as the optional third input to the plugin.
+
+### Additional Resources
+
+The following resources provide a deeper understanding of the NMS algorithm:
+
+#### Networks
+- [EfficientDet](https://arxiv.org/abs/1911.09070)
+- [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325)
+- [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497)
+- [Mask R-CNN](https://arxiv.org/abs/1703.06870)
+
+
+#### Documentation
+- [NMS algorithm](https://www.coursera.org/lecture/convolutional-neural-networks/non-max-suppression-dvrjH)
+- [NonMaxSuppression ONNX Op](https://github.com/onnx/onnx/blob/master/docs/Operators.md#NonMaxSuppression)
+
+## License
+
+For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html)
+documentation.
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cu b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cu
new file mode 100644
index 00000000..5952daa3
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cu
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/bboxUtils.h"
+#include "cub/cub.cuh"
+#include "cuda_runtime_api.h"
+
+#include "efficientNMSCustomInference.cuh"
+#include "efficientNMSCustomInference.h"
+
+#define NMS_TILES 5
+
+using namespace nvinfer1;
+
+template <typename T>
+__device__ float IOU(EfficientNMSCustomParameters param, BoxCorner<T> box1, BoxCorner<T> box2)
+{
+ // Regardless of the selected box coding, IOU is always performed in BoxCorner coding.
+ // The boxes are copied so that they can be reordered without affecting the originals.
+    BoxCorner<T> b1 = box1;
+    BoxCorner<T> b2 = box2;
+ b1.reorder();
+ b2.reorder();
+ float intersectArea = BoxCorner::intersect(b1, b2).area();
+ if (intersectArea <= 0.f)
+ {
+ return 0.f;
+ }
+ float unionArea = b1.area() + b2.area() - intersectArea;
+ if (unionArea <= 0.f)
+ {
+ return 0.f;
+ }
+ return intersectArea / unionArea;
+}
+
+template <typename T, typename Tb>
+__device__ BoxCorner<T> DecodeBoxes(EfficientNMSCustomParameters param, int boxIdx, int anchorIdx,
+ const Tb* __restrict__ boxesInput, const Tb* __restrict__ anchorsInput)
+{
+ // The inputs will be in the selected coding format, as well as the decoding function. But the decoded box
+ // will always be returned as BoxCorner.
+ Tb box = boxesInput[boxIdx];
+ if (!param.boxDecoder)
+ {
+        return BoxCorner<T>(box);
+ }
+ Tb anchor = anchorsInput[anchorIdx];
+ box.reorder();
+ anchor.reorder();
+    return BoxCorner<T>(box.decode(anchor));
+}
+
+template <typename T, typename Tb>
+__device__ void MapNMSData(EfficientNMSCustomParameters param, int idx, int imageIdx, const Tb* __restrict__ boxesInput,
+ const Tb* __restrict__ anchorsInput, const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData,
+ const int* __restrict__ topNumData, const T* __restrict__ sortedScoresData, const int* __restrict__ sortedIndexData,
+    T& scoreMap, int& classMap, BoxCorner<T>& boxMap, int& boxIdxMap)
+{
+ // idx: Holds the NMS box index, within the current batch.
+ // idxSort: Holds the batched NMS box index, which indexes the (filtered, but sorted) score buffer.
+ // scoreMap: Holds the score that corresponds to the indexed box being processed by NMS.
+ if (idx >= topNumData[imageIdx])
+ {
+ return;
+ }
+ int idxSort = imageIdx * param.numScoreElements + idx;
+ scoreMap = sortedScoresData[idxSort];
+
+ // idxMap: Holds the re-mapped index, which indexes the (filtered, but unsorted) buffers.
+ // classMap: Holds the class that corresponds to the idx'th sorted score being processed by NMS.
+ // anchorMap: Holds the anchor that corresponds to the idx'th sorted score being processed by NMS.
+ int idxMap = imageIdx * param.numScoreElements + sortedIndexData[idxSort];
+ classMap = topClassData[idxMap];
+ int anchorMap = topAnchorsData[idxMap];
+
+ // boxIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) boxes input buffer.
+ boxIdxMap = -1;
+ if (param.shareLocation) // Shape of boxesInput: [batchSize, numAnchors, 1, 4]
+ {
+ boxIdxMap = imageIdx * param.numAnchors + anchorMap;
+ }
+ else // Shape of boxesInput: [batchSize, numAnchors, numClasses, 4]
+ {
+ int batchOffset = imageIdx * param.numAnchors * param.numClasses;
+ int anchorOffset = anchorMap * param.numClasses;
+ boxIdxMap = batchOffset + anchorOffset + classMap;
+ }
+ // anchorIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) anchors input buffer.
+ int anchorIdxMap = -1;
+ if (param.shareAnchors) // Shape of anchorsInput: [1, numAnchors, 4]
+ {
+ anchorIdxMap = anchorMap;
+ }
+ else // Shape of anchorsInput: [batchSize, numAnchors, 4]
+ {
+ anchorIdxMap = imageIdx * param.numAnchors + anchorMap;
+ }
+ // boxMap: Holds the box that corresponds to the idx'th sorted score being processed by NMS.
+ boxMap = DecodeBoxes<T, Tb>(param, boxIdxMap, anchorIdxMap, boxesInput, anchorsInput);
+}
+
+template <typename T>
+__device__ void WriteNMSResult(EfficientNMSCustomParameters param, int* __restrict__ numDetectionsOutput,
+ T* __restrict__ nmsScoresOutput, int* __restrict__ nmsClassesOutput, BoxCorner<T>* __restrict__ nmsBoxesOutput,
+ int* __restrict__ nmsIndicesOutput, T threadScore, int threadClass, BoxCorner<T> threadBox, int imageIdx,
+ unsigned int resultsCounter, int boxIdxMap)
+{
+ int outputIdx = imageIdx * param.numOutputBoxes + resultsCounter - 1;
+ if (param.scoreSigmoid)
+ {
+ nmsScoresOutput[outputIdx] = sigmoid_mp(threadScore);
+ }
+ else if (param.scoreBits > 0)
+ {
+ nmsScoresOutput[outputIdx] = add_mp(threadScore, (T) -1);
+ }
+ else
+ {
+ nmsScoresOutput[outputIdx] = threadScore;
+ }
+ nmsClassesOutput[outputIdx] = threadClass;
+ if (param.clipBoxes)
+ {
+ nmsBoxesOutput[outputIdx] = threadBox.clip((T) 0, (T) 1);
+ }
+ else
+ {
+ nmsBoxesOutput[outputIdx] = threadBox;
+ }
+ numDetectionsOutput[imageIdx] = resultsCounter;
+
+ int index = boxIdxMap % param.numAnchors;
+
+ nmsIndicesOutput[outputIdx] = index;
+}
+
+template <typename T, typename Tb>
+__global__ void EfficientNMSCustom(EfficientNMSCustomParameters param, const int* topNumData, int* outputIndexData,
+ int* outputClassData, const int* sortedIndexData, const T* __restrict__ sortedScoresData,
+ const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, const Tb* __restrict__ boxesInput,
+ const Tb* __restrict__ anchorsInput, int* __restrict__ numDetectionsOutput, T* __restrict__ nmsScoresOutput,
+ int* __restrict__ nmsClassesOutput, int* __restrict__ nmsIndicesOutput, BoxCorner<T>* __restrict__ nmsBoxesOutput)
+{
+ unsigned int thread = threadIdx.x;
+ unsigned int imageIdx = blockIdx.y;
+ unsigned int tileSize = blockDim.x;
+ if (imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ int numSelectedBoxes = min(topNumData[imageIdx], param.numSelectedBoxes);
+ int numTiles = (numSelectedBoxes + tileSize - 1) / tileSize;
+ if (thread >= numSelectedBoxes)
+ {
+ return;
+ }
+
+ __shared__ int blockState;
+ __shared__ unsigned int resultsCounter;
+ if (thread == 0)
+ {
+ blockState = 0;
+ resultsCounter = 0;
+ }
+
+ int threadState[NMS_TILES];
+ unsigned int boxIdx[NMS_TILES];
+ T threadScore[NMS_TILES];
+ int threadClass[NMS_TILES];
+ BoxCorner<T> threadBox[NMS_TILES];
+ int boxIdxMap[NMS_TILES];
+ for (int tile = 0; tile < numTiles; tile++)
+ {
+ threadState[tile] = 0;
+ boxIdx[tile] = thread + tile * blockDim.x;
+ MapNMSData(param, boxIdx[tile], imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData,
+ topNumData, sortedScoresData, sortedIndexData, threadScore[tile], threadClass[tile], threadBox[tile],
+ boxIdxMap[tile]);
+ }
+
+ // Iterate through all boxes to NMS against.
+ for (int i = 0; i < numSelectedBoxes; i++)
+ {
+ int tile = i / tileSize;
+
+ if (boxIdx[tile] == i)
+ {
+ // Iteration lead thread, figure out what the other threads should do,
+ // this will be signaled via the blockState shared variable.
+ if (threadState[tile] == -1)
+ {
+ // Thread already dead, this box was already dropped in a previous iteration,
+ // because it had a large IOU overlap with another lead thread previously, so
+ // it would never be kept anyway, therefore it can safely skip all IOU operations
+ // in this iteration.
+ blockState = -1; // -1 => Signal all threads to skip iteration
+ }
+ else if (threadState[tile] == 0)
+ {
+ // As this box will be kept, this is a good place to find what index in the results buffer it
+ // should have, as this allows to perform an early loop exit if there are enough results.
+ if (resultsCounter >= param.numOutputBoxes)
+ {
+ blockState = -2; // -2 => Signal all threads to do an early loop exit.
+ }
+ else
+ {
+ // Thread is still alive, because it has not had a large enough IOU overlap with
+ // any other kept box previously. Therefore, this box will be kept for sure. However,
+ // we need to check against all other subsequent boxes from this position onward,
+ // to see how those other boxes will behave in future iterations.
+ blockState = 1; // +1 => Signal all (higher index) threads to calculate IOU against this box
+ threadState[tile] = 1; // +1 => Mark this box's thread to be kept and written out to results
+
+ // If the numOutputBoxesPerClass check is enabled, write the result only if the limit for this
+ // class on this image has not been reached yet. Other than (possibly) skipping the write, this
+ // won't affect anything else in the NMS threading.
+ bool write = true;
+ if (param.numOutputBoxesPerClass >= 0)
+ {
+ int classCounterIdx = imageIdx * param.numClasses + threadClass[tile];
+ write = (outputClassData[classCounterIdx] < param.numOutputBoxesPerClass);
+ outputClassData[classCounterIdx]++;
+ }
+ if (write)
+ {
+ // This branch is visited by one thread per iteration, so it's safe to do non-atomic increments.
+ resultsCounter++;
+
+ WriteNMSResult(param, numDetectionsOutput, nmsScoresOutput, nmsClassesOutput, nmsBoxesOutput,
+ nmsIndicesOutput, threadScore[tile], threadClass[tile], threadBox[tile], imageIdx,
+ resultsCounter, boxIdxMap[tile]);
+ }
+ }
+ }
+ else
+ {
+ // This state should never be reached, but just in case...
+ blockState = 0; // 0 => Signal all threads to not do any updates, nothing happens.
+ }
+ }
+
+ __syncthreads();
+
+ if (blockState == -2)
+ {
+ // This is the signal to exit from the loop.
+ return;
+ }
+
+ if (blockState == -1)
+ {
+ // This is the signal for all threads to just skip this iteration, as no IOU's need to be checked.
+ continue;
+ }
+
+ // Grab a box and class to test the current box against. The test box corresponds to iteration i,
+ // therefore it will have a lower index than the current thread box, and will therefore have a higher score
+ // than the current box because it's located "before" in the sorted score list.
+ T testScore;
+ int testClass;
+ BoxCorner<T> testBox;
+ int testBoxIdxMap;
+ MapNMSData(param, i, imageIdx, boxesInput, anchorsInput, topClassData, topAnchorsData, topNumData,
+ sortedScoresData, sortedIndexData, testScore, testClass, testBox, testBoxIdxMap);
+
+ for (int tile = 0; tile < numTiles; tile++)
+ {
+ // IOU
+ if (boxIdx[tile] > i && // Make sure two different boxes are being tested, and that it's a higher index;
+ boxIdx[tile] < numSelectedBoxes && // Make sure the box is within numSelectedBoxes;
+ blockState == 1 && // Signal that allows IOU checks to be performed;
+ threadState[tile] == 0 && // Make sure this box hasn't been either dropped or kept already;
+ threadClass[tile] == testClass && // Compare only boxes of matching classes;
+ lte_mp(threadScore[tile], testScore) && // Make sure the sorting order of scores is as expected;
+ IOU(param, threadBox[tile], testBox) >= param.iouThreshold) // And... IOU overlap.
+ {
+ // Current box overlaps with the box tested in this iteration, this box will be skipped.
+ threadState[tile] = -1; // -1 => Mark this box's thread to be dropped.
+ }
+ }
+ }
+}
+
+template <typename T>
+cudaError_t EfficientNMSCustomLauncher(EfficientNMSCustomParameters& param, int* topNumData, int* outputIndexData,
+ int* outputClassData, int* sortedIndexData, T* sortedScoresData, int* topClassData, int* topAnchorsData,
+ const void* boxesInput, const void* anchorsInput, int* numDetectionsOutput, T* nmsScoresOutput,
+ int* nmsClassesOutput, int* nmsIndicesOutput, void* nmsBoxesOutput, cudaStream_t stream)
+{
+ unsigned int tileSize = param.numSelectedBoxes / NMS_TILES;
+ if (param.numSelectedBoxes <= 512)
+ {
+ tileSize = 512;
+ }
+ if (param.numSelectedBoxes <= 256)
+ {
+ tileSize = 256;
+ }
+
+ const dim3 blockSize = {tileSize, 1, 1};
+ const dim3 gridSize = {1, (unsigned int) param.batchSize, 1};
+
+ if (param.boxCoding == 0)
+ {
+ EfficientNMSCustom<T, BoxCorner<T>><<<gridSize, blockSize, 0, stream>>>(param, topNumData, outputIndexData,
+ outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData,
+ (BoxCorner<T>*) boxesInput, (BoxCorner<T>*) anchorsInput, numDetectionsOutput, nmsScoresOutput,
+ nmsClassesOutput, nmsIndicesOutput, (BoxCorner<T>*) nmsBoxesOutput);
+ }
+ else if (param.boxCoding == 1)
+ {
+ // Note that nmsBoxesOutput is always coded as BoxCorner, regardless of the input coding type.
+ EfficientNMSCustom<T, BoxCenterSize<T>><<<gridSize, blockSize, 0, stream>>>(param, topNumData, outputIndexData,
+ outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData,
+ (BoxCenterSize<T>*) boxesInput, (BoxCenterSize<T>*) anchorsInput, numDetectionsOutput, nmsScoresOutput,
+ nmsClassesOutput, nmsIndicesOutput, (BoxCorner<T>*) nmsBoxesOutput);
+ }
+
+ return cudaGetLastError();
+}
+
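+// Computes the per-image [start, end) offsets that delimit each batch item's segment in the filtered
+// score buffer; these offsets are later passed to cub's segmented radix sort so each image is sorted independently.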
+__global__ void EfficientNMSCustomFilterSegments(EfficientNMSCustomParameters param, const int* __restrict__ topNumData,
+ int* __restrict__ topOffsetsStartData, int* __restrict__ topOffsetsEndData)
+{
+ int imageIdx = threadIdx.x;
+ if (imageIdx > param.batchSize)
+ {
+ return;
+ }
+ topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+ topOffsetsEndData[imageIdx] = imageIdx * param.numScoreElements + topNumData[imageIdx];
+}
+
+template <typename T>
+__global__ void EfficientNMSCustomFilter(EfficientNMSCustomParameters param, const T* __restrict__ scoresInput,
+ int* __restrict__ topNumData, int* __restrict__ topIndexData, int* __restrict__ topAnchorsData,
+ T* __restrict__ topScoresData, int* __restrict__ topClassData)
+{
+ int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+ int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+ // Boundary Conditions
+ if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ // Shape of scoresInput: [batchSize, numAnchors, numClasses]
+ int scoresInputIdx = imageIdx * param.numScoreElements + elementIdx;
+
+ // For each class, check its corresponding score if it crosses the threshold, and if so select this anchor,
+ // and keep track of the maximum score and the corresponding (argmax) class id
+ T score = scoresInput[scoresInputIdx];
+ if (gte_mp(score, (T) param.scoreThreshold))
+ {
+ // Unpack the class and anchor index from the element index
+ int classIdx = elementIdx % param.numClasses;
+ int anchorIdx = elementIdx / param.numClasses;
+
+ // If this is a background class, ignore it.
+ if (classIdx == param.backgroundClass)
+ {
+ return;
+ }
+
+ // Use an atomic to find an open slot where to write the selected anchor data.
+ if (topNumData[imageIdx] >= param.numScoreElements)
+ {
+ return;
+ }
+ int selectedIdx = atomicAdd((unsigned int*) &topNumData[imageIdx], 1);
+ if (selectedIdx >= param.numScoreElements)
+ {
+ topNumData[imageIdx] = param.numScoreElements;
+ return;
+ }
+
+ // Shape of topScoresData / topClassData: [batchSize, numScoreElements]
+ int topIdx = imageIdx * param.numScoreElements + selectedIdx;
+
+ if (param.scoreBits > 0)
+ {
+ score = add_mp(score, (T) 1);
+ if (gt_mp(score, (T) (2.f - 1.f / 1024.f)))
+ {
+ // Ensure the incremented score fits in the mantissa without changing the exponent
+ score = (2.f - 1.f / 1024.f);
+ }
+ }
+
+ topIndexData[topIdx] = selectedIdx;
+ topAnchorsData[topIdx] = anchorIdx;
+ topScoresData[topIdx] = score;
+ topClassData[topIdx] = classIdx;
+ }
+}
+
+template <typename T>
+__global__ void EfficientNMSCustomDenseIndex(EfficientNMSCustomParameters param, int* __restrict__ topNumData,
+ int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, int* __restrict__ topOffsetsStartData,
+ int* __restrict__ topOffsetsEndData, T* __restrict__ topScoresData, int* __restrict__ topClassData)
+{
+ int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+ int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+ if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ int dataIdx = imageIdx * param.numScoreElements + elementIdx;
+ int anchorIdx = elementIdx / param.numClasses;
+ int classIdx = elementIdx % param.numClasses;
+ if (param.scoreBits > 0)
+ {
+ T score = topScoresData[dataIdx];
+ if (lt_mp(score, (T) param.scoreThreshold))
+ {
+ score = (T) 1;
+ }
+ else if (classIdx == param.backgroundClass)
+ {
+ score = (T) 1;
+ }
+ else
+ {
+ score = add_mp(score, (T) 1);
+ if (gt_mp(score, (T) (2.f - 1.f / 1024.f)))
+ {
+ // Ensure the incremented score fits in the mantissa without changing the exponent
+ score = (2.f - 1.f / 1024.f);
+ }
+ }
+ topScoresData[dataIdx] = score;
+ }
+ else
+ {
+ T score = topScoresData[dataIdx];
+ if (lt_mp(score, (T) param.scoreThreshold))
+ {
+ topScoresData[dataIdx] = -(1 << 15);
+ }
+ else if (classIdx == param.backgroundClass)
+ {
+ topScoresData[dataIdx] = -(1 << 15);
+ }
+ }
+
+ topIndexData[dataIdx] = elementIdx;
+ topAnchorsData[dataIdx] = anchorIdx;
+ topClassData[dataIdx] = classIdx;
+
+ if (elementIdx == 0)
+ {
+ // Saturate counters
+ topNumData[imageIdx] = param.numScoreElements;
+ topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+ topOffsetsEndData[imageIdx] = (imageIdx + 1) * param.numScoreElements;
+ }
+}
+
+template <typename T>
+cudaError_t EfficientNMSCustomFilterLauncher(EfficientNMSCustomParameters& param, const T* scoresInput, int* topNumData,
+ int* topIndexData, int* topAnchorsData, int* topOffsetsStartData, int* topOffsetsEndData, T* topScoresData,
+ int* topClassData, cudaStream_t stream)
+{
+ const unsigned int elementsPerBlock = 512;
+ const unsigned int imagesPerBlock = 1;
+ const unsigned int elementBlocks = (param.numScoreElements + elementsPerBlock - 1) / elementsPerBlock;
+ const unsigned int imageBlocks = (param.batchSize + imagesPerBlock - 1) / imagesPerBlock;
+ const dim3 blockSize = {elementsPerBlock, imagesPerBlock, 1};
+ const dim3 gridSize = {elementBlocks, imageBlocks, 1};
+
+ float kernelSelectThreshold = 0.007f;
+ if (param.scoreSigmoid)
+ {
+ // Inverse Sigmoid
+ if (param.scoreThreshold <= 0.f)
+ {
+ param.scoreThreshold = -(1 << 15);
+ }
+ else
+ {
+ param.scoreThreshold = logf(param.scoreThreshold / (1.f - param.scoreThreshold));
+ }
+ kernelSelectThreshold = logf(kernelSelectThreshold / (1.f - kernelSelectThreshold));
+ // Disable Score Bits Optimization
+ param.scoreBits = -1;
+ }
+
+ if (param.scoreThreshold < kernelSelectThreshold)
+ {
+ // A full copy of the buffer is necessary because sorting will scramble the input data otherwise.
+ PLUGIN_CHECK_CUDA(cudaMemcpyAsync(topScoresData, scoresInput, param.batchSize * param.numScoreElements * sizeof(T),
+ cudaMemcpyDeviceToDevice, stream));
+
+ EfficientNMSCustomDenseIndex<T><<<gridSize, blockSize, 0, stream>>>(param, topNumData, topIndexData, topAnchorsData,
+ topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData);
+ }
+ else
+ {
+ EfficientNMSCustomFilter<T><<<gridSize, blockSize, 0, stream>>>(
+ param, scoresInput, topNumData, topIndexData, topAnchorsData, topScoresData, topClassData);
+
+ EfficientNMSCustomFilterSegments<<<1, param.batchSize, 0, stream>>>(
+ param, topNumData, topOffsetsStartData, topOffsetsEndData);
+ }
+
+ return cudaGetLastError();
+}
+
+template <typename T>
+size_t EfficientNMSCustomSortWorkspaceSize(int batchSize, int numScoreElements)
+{
+ size_t sortedWorkspaceSize = 0;
+ cub::DoubleBuffer<T> keysDB(nullptr, nullptr);
+ cub::DoubleBuffer<int> valuesDB(nullptr, nullptr);
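+ // With a null temporary-storage pointer, cub only computes the required workspace size into sortedWorkspaceSize.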
+ cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, sortedWorkspaceSize, keysDB, valuesDB, numScoreElements,
+ batchSize, (const int*) nullptr, (const int*) nullptr);
+ return sortedWorkspaceSize;
+}
+
+size_t EfficientNMSCustomWorkspaceSize(int batchSize, int numScoreElements, int numClasses, DataType datatype)
+{
+ size_t total = 0;
+ const size_t align = 256;
+ // Counters
+ // 3 for Filtering
+ // 1 for Output Indexing
+ // C for Max per Class Limiting
+ size_t size = (3 + 1 + numClasses) * batchSize * sizeof(int);
+ total += size + (size % align ? align - (size % align) : 0);
+ // Int Buffers
+ for (int i = 0; i < 4; i++)
+ {
+ size = batchSize * numScoreElements * sizeof(int);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ // Float Buffers
+ for (int i = 0; i < 2; i++)
+ {
+ size = batchSize * numScoreElements * dataTypeSize(datatype);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ // Sort Workspace
+ if (datatype == DataType::kHALF)
+ {
+ size = EfficientNMSCustomSortWorkspaceSize<__half>(batchSize, numScoreElements);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ else if (datatype == DataType::kFLOAT)
+ {
+ size = EfficientNMSCustomSortWorkspaceSize<float>(batchSize, numScoreElements);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+
+ return total;
+}
+
+template <typename T>
+T* EfficientNMSCustomWorkspace(void* workspace, size_t& offset, size_t elements)
+{
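+ // Carves a typed sub-buffer out of the shared workspace and advances the running offset by the
+ // requested size rounded up to a 256-byte boundary.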
+ T* buffer = (T*) ((size_t) workspace + offset);
+ size_t align = 256;
+ size_t size = elements * sizeof(T);
+ size_t sizeAligned = size + (size % align ? align - (size % align) : 0);
+ offset += sizeAligned;
+ return buffer;
+}
+
+template <typename T>
+pluginStatus_t EfficientNMSCustomDispatch(EfficientNMSCustomParameters param, const void* boxesInput, const void* scoresInput,
+ const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput,
+ void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream)
+{
+ // Clear Outputs (not all elements will get overwritten by the kernels, so safer to clear everything out)
+ CSC(cudaMemsetAsync(numDetectionsOutput, 0x00, param.batchSize * sizeof(int), stream));
+ CSC(cudaMemsetAsync(nmsScoresOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(T), stream));
+ CSC(cudaMemsetAsync(nmsBoxesOutput, 0x00, param.batchSize * param.numOutputBoxes * 4 * sizeof(T), stream));
+ CSC(cudaMemsetAsync(nmsClassesOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(int), stream));
+ CSC(cudaMemsetAsync(nmsIndicesOutput, 0xFF, param.batchSize * param.numOutputBoxes * sizeof(int), stream));
+
+ // Empty Inputs
+ if (param.numScoreElements < 1)
+ {
+ return STATUS_SUCCESS;
+ }
+
+ // Counters Workspace
+ size_t workspaceOffset = 0;
+ int countersTotalSize = (3 + 1 + param.numClasses) * param.batchSize;
+ int* topNumData = EfficientNMSCustomWorkspace<int>(workspace, workspaceOffset, countersTotalSize);
+ int* topOffsetsStartData = topNumData + param.batchSize;
+ int* topOffsetsEndData = topNumData + 2 * param.batchSize;
+ int* outputIndexData = topNumData + 3 * param.batchSize;
+ int* outputClassData = topNumData + 4 * param.batchSize;
+ CSC(cudaMemsetAsync(topNumData, 0x00, countersTotalSize * sizeof(int), stream));
+ cudaError_t status = cudaGetLastError();
+ CSC(status, STATUS_FAILURE);
+
+ // Other Buffers Workspace
+ int* topIndexData
+ = EfficientNMSCustomWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ int* topClassData
+ = EfficientNMSCustomWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ int* topAnchorsData
+ = EfficientNMSCustomWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ int* sortedIndexData
+ = EfficientNMSCustomWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ T* topScoresData = EfficientNMSCustomWorkspace<T>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ T* sortedScoresData
+ = EfficientNMSCustomWorkspace<T>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+ size_t sortedWorkspaceSize = EfficientNMSCustomSortWorkspaceSize<T>(param.batchSize, param.numScoreElements);
+ char* sortedWorkspaceData = EfficientNMSCustomWorkspace<char>(workspace, workspaceOffset, sortedWorkspaceSize);
+ cub::DoubleBuffer<T> scoresDB(topScoresData, sortedScoresData);
+ cub::DoubleBuffer<int> indexDB(topIndexData, sortedIndexData);
+
+ // Device Specific Properties
+ int device;
+ CSC(cudaGetDevice(&device));
+ struct cudaDeviceProp properties;
+ CSC(cudaGetDeviceProperties(&properties, device));
+ if (properties.regsPerBlock >= 65536)
+ {
+ // Most Devices
+ param.numSelectedBoxes = 5000;
+ }
+ else
+ {
+ // Jetson TX1/TX2
+ param.numSelectedBoxes = 2000;
+ }
+
+ // Kernels
+ status = EfficientNMSCustomFilterLauncher(param, (T*) scoresInput, topNumData, topIndexData, topAnchorsData,
+ topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData, stream);
+ CSC(status, STATUS_FAILURE);
+
+ status = cub::DeviceSegmentedRadixSort::SortPairsDescending(sortedWorkspaceData, sortedWorkspaceSize, scoresDB,
+ indexDB, param.batchSize * param.numScoreElements, param.batchSize, topOffsetsStartData, topOffsetsEndData,
+ param.scoreBits > 0 ? (10 - param.scoreBits) : 0, param.scoreBits > 0 ? 10 : sizeof(T) * 8, stream, false);
+ CSC(status, STATUS_FAILURE);
+
+ status = EfficientNMSCustomLauncher(param, topNumData, outputIndexData, outputClassData, indexDB.Current(),
+ scoresDB.Current(), topClassData, topAnchorsData, boxesInput, anchorsInput, (int*) numDetectionsOutput,
+ (T*) nmsScoresOutput, (int*) nmsClassesOutput, (int*) nmsIndicesOutput, nmsBoxesOutput, stream);
+ CSC(status, STATUS_FAILURE);
+
+ return STATUS_SUCCESS;
+}
+
+pluginStatus_t EfficientNMSCustomInference(EfficientNMSCustomParameters param, const void* boxesInput, const void* scoresInput,
+ const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput,
+ void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream)
+{
+ if (param.datatype == DataType::kFLOAT)
+ {
+ param.scoreBits = -1;
+ return EfficientNMSCustomDispatch<float>(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput,
+ nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream);
+ }
+ else if (param.datatype == DataType::kHALF)
+ {
+ if (param.scoreBits <= 0 || param.scoreBits > 10)
+ {
+ param.scoreBits = -1;
+ }
+ return EfficientNMSCustomDispatch<__half>(param, boxesInput, scoresInput, anchorsInput, numDetectionsOutput,
+ nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream);
+ }
+ else
+ {
+ return STATUS_NOT_SUPPORTED;
+ }
+}
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cuh b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cuh
new file mode 100644
index 00000000..491bc1a9
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.cuh
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_CUSTOM_INFERENCE_CUH
+#define TRT_EFFICIENT_NMS_CUSTOM_INFERENCE_CUH
+
+#include <cuda_fp16.h>
+
+// FP32 Intrinsics
+
+float __device__ __inline__ exp_mp(const float a)
+{
+ return __expf(a);
+}
+float __device__ __inline__ sigmoid_mp(const float a)
+{
+ return __frcp_rn(__fadd_rn(1.f, __expf(-a)));
+}
+float __device__ __inline__ add_mp(const float a, const float b)
+{
+ return __fadd_rn(a, b);
+}
+float __device__ __inline__ sub_mp(const float a, const float b)
+{
+ return __fsub_rn(a, b);
+}
+float __device__ __inline__ mul_mp(const float a, const float b)
+{
+ return __fmul_rn(a, b);
+}
+bool __device__ __inline__ gt_mp(const float a, const float b)
+{
+ return a > b;
+}
+bool __device__ __inline__ lt_mp(const float a, const float b)
+{
+ return a < b;
+}
+bool __device__ __inline__ lte_mp(const float a, const float b)
+{
+ return a <= b;
+}
+bool __device__ __inline__ gte_mp(const float a, const float b)
+{
+ return a >= b;
+}
+
+#if __CUDA_ARCH__ >= 530
+
+// FP16 Intrinsics
+
+__half __device__ __inline__ exp_mp(const __half a)
+{
+ return hexp(a);
+}
+__half __device__ __inline__ sigmoid_mp(const __half a)
+{
+ return hrcp(__hadd((__half) 1, hexp(__hneg(a))));
+}
+__half __device__ __inline__ add_mp(const __half a, const __half b)
+{
+ return __hadd(a, b);
+}
+__half __device__ __inline__ sub_mp(const __half a, const __half b)
+{
+ return __hsub(a, b);
+}
+__half __device__ __inline__ mul_mp(const __half a, const __half b)
+{
+ return __hmul(a, b);
+}
+bool __device__ __inline__ gt_mp(const __half a, const __half b)
+{
+ return __hgt(a, b);
+}
+bool __device__ __inline__ lt_mp(const __half a, const __half b)
+{
+ return __hlt(a, b);
+}
+bool __device__ __inline__ lte_mp(const __half a, const __half b)
+{
+ return __hle(a, b);
+}
+bool __device__ __inline__ gte_mp(const __half a, const __half b)
+{
+ return __hge(a, b);
+}
+
+#else
+
+// FP16 Fallbacks on older architectures that lack support
+
+__half __device__ __inline__ exp_mp(const __half a)
+{
+ return __float2half(exp_mp(__half2float(a)));
+}
+__half __device__ __inline__ sigmoid_mp(const __half a)
+{
+ return __float2half(sigmoid_mp(__half2float(a)));
+}
+__half __device__ __inline__ add_mp(const __half a, const __half b)
+{
+ return __float2half(add_mp(__half2float(a), __half2float(b)));
+}
+__half __device__ __inline__ sub_mp(const __half a, const __half b)
+{
+ return __float2half(sub_mp(__half2float(a), __half2float(b)));
+}
+__half __device__ __inline__ mul_mp(const __half a, const __half b)
+{
+ return __float2half(mul_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ gt_mp(const __half a, const __half b)
+{
+ return __float2half(gt_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ lt_mp(const __half a, const __half b)
+{
+ return __float2half(lt_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ lte_mp(const __half a, const __half b)
+{
+ return __float2half(lte_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ gte_mp(const __half a, const __half b)
+{
+ return __float2half(gte_mp(__half2float(a), __half2float(b)));
+}
+
+#endif
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCorner;
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCenterSize;
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCorner
+{
+ // For NMS/IOU purposes, YXYX coding is identical to XYXY
+ T y1, x1, y2, x2;
+
+ __device__ void reorder()
+ {
+ if (gt_mp(y1, y2))
+ {
+ // Swap values, so y1 < y2
+ y1 = sub_mp(y1, y2);
+ y2 = add_mp(y1, y2);
+ y1 = sub_mp(y2, y1);
+ }
+ if (gt_mp(x1, x2))
+ {
+ // Swap values, so x1 < x2
+ x1 = sub_mp(x1, x2);
+ x2 = add_mp(x1, x2);
+ x1 = sub_mp(x2, x1);
+ }
+ }
+
+ __device__ BoxCorner<T> clip(T low, T high) const
+ {
+ return {lt_mp(y1, low) ? low : (gt_mp(y1, high) ? high : y1),
+ lt_mp(x1, low) ? low : (gt_mp(x1, high) ? high : x1), lt_mp(y2, low) ? low : (gt_mp(y2, high) ? high : y2),
+ lt_mp(x2, low) ? low : (gt_mp(x2, high) ? high : x2)};
+ }
+
+ __device__ BoxCorner<T> decode(BoxCorner<T> anchor) const
+ {
+ return {add_mp(y1, anchor.y1), add_mp(x1, anchor.x1), add_mp(y2, anchor.y2), add_mp(x2, anchor.x2)};
+ }
+
+ __device__ float area() const
+ {
+ T w = sub_mp(x2, x1);
+ T h = sub_mp(y2, y1);
+ if (lte_mp(h, (T) 0))
+ {
+ return 0;
+ }
+ if (lte_mp(w, (T) 0))
+ {
+ return 0;
+ }
+ return (float) h * (float) w;
+ }
+
+ __device__ operator BoxCenterSize<T>() const
+ {
+ T w = sub_mp(x2, x1);
+ T h = sub_mp(y2, y1);
+ return BoxCenterSize<T>{add_mp(y1, mul_mp((T) 0.5, h)), add_mp(x1, mul_mp((T) 0.5, w)), h, w};
+ }
+
+ __device__ static BoxCorner<T> intersect(BoxCorner<T> a, BoxCorner<T> b)
+ {
+ return {gt_mp(a.y1, b.y1) ? a.y1 : b.y1, gt_mp(a.x1, b.x1) ? a.x1 : b.x1, lt_mp(a.y2, b.y2) ? a.y2 : b.y2,
+ lt_mp(a.x2, b.x2) ? a.x2 : b.x2};
+ }
+};
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCenterSize
+{
+ // For NMS/IOU purposes, YXHW coding is identical to XYWH
+ T y, x, h, w;
+
+ __device__ void reorder() {}
+
+ __device__ BoxCenterSize<T> clip(T low, T high) const
+ {
+ return BoxCenterSize<T>(BoxCorner<T>(*this).clip(low, high));
+ }
+
+ __device__ BoxCenterSize<T> decode(BoxCenterSize<T> anchor) const
+ {
+ return {add_mp(mul_mp(y, anchor.h), anchor.y), add_mp(mul_mp(x, anchor.w), anchor.x),
+ mul_mp(anchor.h, exp_mp(h)), mul_mp(anchor.w, exp_mp(w))};
+ }
+
+ __device__ float area() const
+ {
+ if (h <= (T) 0)
+ {
+ return 0;
+ }
+ if (w <= (T) 0)
+ {
+ return 0;
+ }
+ return (float) h * (float) w;
+ }
+
+ __device__ operator BoxCorner<T>() const
+ {
+ T h2 = mul_mp(h, (T) 0.5);
+ T w2 = mul_mp(w, (T) 0.5);
+ return BoxCorner<T>{sub_mp(y, h2), sub_mp(x, w2), add_mp(y, h2), add_mp(x, w2)};
+ }
+ __device__ static BoxCenterSize<T> intersect(BoxCenterSize<T> a, BoxCenterSize<T> b)
+ {
+ return BoxCenterSize<T>(BoxCorner<T>::intersect(BoxCorner<T>(a), BoxCorner<T>(b)));
+ }
+};
+
+#endif
\ No newline at end of file
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.h b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.h
new file mode 100644
index 00000000..8a16a522
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomInference.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_CUSTOM_INFERENCE_H
+#define TRT_EFFICIENT_NMS_CUSTOM_INFERENCE_H
+
+#include "common/plugin.h"
+
+#include "efficientNMSCustomParameters.h"
+
+size_t EfficientNMSCustomWorkspaceSize(int batchSize, int numScoreElements, int numClasses, nvinfer1::DataType datatype);
+
+pluginStatus_t EfficientNMSCustomInference(EfficientNMSCustomParameters param, const void* boxesInput, const void* scoresInput,
+ const void* anchorsInput, void* numDetectionsOutput, void* nmsBoxesOutput, void* nmsScoresOutput,
+ void* nmsClassesOutput, void* nmsIndicesOutput, void* workspace, cudaStream_t stream);
+
+#endif
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomParameters.h b/plugin/efficientNMSCustomPlugin/efficientNMSCustomParameters.h
new file mode 100644
index 00000000..13a106a4
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomParameters.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_CUSTOM_PARAMETERS_H
+#define TRT_EFFICIENT_NMS_CUSTOM_PARAMETERS_H
+
+#include "common/plugin.h"
+
+using namespace nvinfer1::plugin;
+namespace nvinfer1
+{
+namespace plugin
+{
+
+struct EfficientNMSCustomParameters
+{
+ // Related to NMS Options
+ float iouThreshold = 0.5f;
+ float scoreThreshold = 0.5f;
+ int numOutputBoxes = 100;
+ int numOutputBoxesPerClass = -1;
+ bool padOutputBoxesPerClass = false;
+ int backgroundClass = -1;
+ bool scoreSigmoid = false;
+ bool clipBoxes = false;
+ int boxCoding = 0;
+
+ // Related to NMS Internals
+ int numSelectedBoxes = 4096;
+ int scoreBits = -1;
+
+ // Related to Tensor Configuration
+ // (These are set by the various plugin configuration methods, no need to define them during plugin creation.)
+ int batchSize = -1;
+ int numClasses = 1;
+ int numBoxElements = -1;
+ int numScoreElements = -1;
+ int numAnchors = -1;
+ bool shareLocation = true;
+ bool shareAnchors = true;
+ bool boxDecoder = false;
+ nvinfer1::DataType datatype = nvinfer1::DataType::kFLOAT;
+};
+
+} // namespace plugin
+} // namespace nvinfer1
+
+#endif
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.cpp b/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.cpp
new file mode 100644
index 00000000..0f2b47d3
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.cpp
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "efficientNMSCustomPlugin.h"
+#include "efficientNMSCustomInference.h"
+
+using namespace nvinfer1;
+using nvinfer1::plugin::EfficientNMSCustomPlugin;
+using nvinfer1::plugin::EfficientNMSCustomParameters;
+using nvinfer1::plugin::EfficientNMSCustomPluginCreator;
+
+namespace
+{
+const char* EFFICIENT_NMS_CUSTOM_PLUGIN_VERSION{"1"};
+const char* EFFICIENT_NMS_CUSTOM_PLUGIN_NAME{"EfficientNMSCustom_TRT"};
+} // namespace
+
+EfficientNMSCustomPlugin::EfficientNMSCustomPlugin(EfficientNMSCustomParameters param)
+ : mParam(param)
+{
+}
+
+EfficientNMSCustomPlugin::EfficientNMSCustomPlugin(const void* data, size_t length)
+{
+ const char *d = reinterpret_cast<const char*>(data), *a = d;
+ mParam = read<EfficientNMSCustomParameters>(d);
+ PLUGIN_ASSERT(d == a + length);
+}
+
+const char* EfficientNMSCustomPlugin::getPluginType() const noexcept
+{
+ return EFFICIENT_NMS_CUSTOM_PLUGIN_NAME;
+}
+
+const char* EfficientNMSCustomPlugin::getPluginVersion() const noexcept
+{
+ return EFFICIENT_NMS_CUSTOM_PLUGIN_VERSION;
+}
+
+int EfficientNMSCustomPlugin::getNbOutputs() const noexcept
+{
+ // Standard Plugin Implementation
+ return 5;
+}
+
+int EfficientNMSCustomPlugin::initialize() noexcept
+{
+ return STATUS_SUCCESS;
+}
+
+void EfficientNMSCustomPlugin::terminate() noexcept {}
+
+size_t EfficientNMSCustomPlugin::getSerializationSize() const noexcept
+{
+ return sizeof(EfficientNMSCustomParameters);
+}
+
+void EfficientNMSCustomPlugin::serialize(void* buffer) const noexcept
+{
+ char *d = reinterpret_cast<char*>(buffer), *a = d;
+ write(d, mParam);
+ PLUGIN_ASSERT(d == a + getSerializationSize());
+}
+
+void EfficientNMSCustomPlugin::destroy() noexcept
+{
+ delete this;
+}
+
+void EfficientNMSCustomPlugin::setPluginNamespace(const char* pluginNamespace) noexcept
+{
+ try
+ {
+ mNamespace = pluginNamespace;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+}
+
+const char* EfficientNMSCustomPlugin::getPluginNamespace() const noexcept
+{
+ return mNamespace.c_str();
+}
+
+nvinfer1::DataType EfficientNMSCustomPlugin::getOutputDataType(
+ int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
+{
+ // On standard NMS, num_detections and detection_classes use integer outputs
+ if (index == 0 || index == 3 || index == 4)
+ {
+ return nvinfer1::DataType::kINT32;
+ }
+ // All others should use the same datatype as the input
+ return inputTypes[0];
+}
+
+IPluginV2DynamicExt* EfficientNMSCustomPlugin::clone() const noexcept
+{
+ try
+ {
+ auto* plugin = new EfficientNMSCustomPlugin(mParam);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
+
+DimsExprs EfficientNMSCustomPlugin::getOutputDimensions(
+ int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) noexcept
+{
+ try
+ {
+ DimsExprs out_dim;
+
+ // When pad per class is set, the output size may need to be reduced:
+ // i.e.: outputBoxes = min(outputBoxes, outputBoxesPerClass * numClasses)
+ // As the number of classes may not be static, numOutputBoxes must be a dynamic
+ // expression. The corresponding parameter can not be set at this time, so the
+ // value will be calculated again in configurePlugin() and the param overwritten.
+ const IDimensionExpr* numOutputBoxes = exprBuilder.constant(mParam.numOutputBoxes);
+ if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0)
+ {
+ const IDimensionExpr* numOutputBoxesPerClass = exprBuilder.constant(mParam.numOutputBoxesPerClass);
+ const IDimensionExpr* numClasses = inputs[1].d[2];
+ numOutputBoxes = exprBuilder.operation(DimensionOperation::kMIN, *numOutputBoxes,
+ *exprBuilder.operation(DimensionOperation::kPROD, *numOutputBoxesPerClass, *numClasses));
+ }
+
+ // Standard NMS
+ PLUGIN_ASSERT(outputIndex >= 0 && outputIndex <= 4);
+
+ // num_detections
+ if (outputIndex == 0)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = exprBuilder.constant(1);
+ }
+ // detection_boxes
+ else if (outputIndex == 1)
+ {
+ out_dim.nbDims = 3;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ out_dim.d[2] = exprBuilder.constant(4);
+ }
+ // detection_scores
+ else if (outputIndex == 2)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ }
+ // detection_classes
+ else if (outputIndex == 3)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ }
+ // detection_indices
+ else
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ }
+
+ return out_dim;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return DimsExprs{};
+}
+
+bool EfficientNMSCustomPlugin::supportsFormatCombination(
+ int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept
+{
+ if (inOut[pos].format != PluginFormat::kLINEAR)
+ {
+ return false;
+ }
+
+ PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3);
+ PLUGIN_ASSERT(nbOutputs == 5);
+ if (nbInputs == 2)
+ {
+ PLUGIN_ASSERT(0 <= pos && pos <= 6);
+ }
+ if (nbInputs == 3)
+ {
+ PLUGIN_ASSERT(0 <= pos && pos <= 7);
+ }
+
+ // num_detections and detection_classes output: int
+ const int posOut = pos - nbInputs;
+ if (posOut == 0 || posOut == 3 || posOut == 4)
+ {
+ return inOut[pos].type == DataType::kINT32 && inOut[pos].format == PluginFormat::kLINEAR;
+ }
+
+ // all other inputs/outputs: fp32 or fp16
+ return (inOut[pos].type == DataType::kHALF || inOut[pos].type == DataType::kFLOAT)
+ && (inOut[0].type == inOut[pos].type);
+}
+
+void EfficientNMSCustomPlugin::configurePlugin(
+ const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) noexcept
+{
+ try
+ {
+ // Accepts two or three inputs
+ // If two inputs: [0] boxes, [1] scores
+ // If three inputs: [0] boxes, [1] scores, [2] anchors
+ PLUGIN_ASSERT(nbInputs == 2 || nbInputs == 3);
+ PLUGIN_ASSERT(nbOutputs == 5);
+
+ mParam.datatype = in[0].desc.type;
+
+ // Shape of scores input should be
+ // [batch_size, num_boxes, num_classes] or [batch_size, num_boxes, num_classes, 1]
+ PLUGIN_ASSERT(in[1].desc.dims.nbDims == 3 || (in[1].desc.dims.nbDims == 4 && in[1].desc.dims.d[3] == 1));
+ mParam.numScoreElements = in[1].desc.dims.d[1] * in[1].desc.dims.d[2];
+ mParam.numClasses = in[1].desc.dims.d[2];
+
+ // When pad per class is set, the total output boxes size may need to be reduced.
+ // This operation is also done in getOutputDimension(), but for dynamic shapes, the
+ // numOutputBoxes param can't be set until the number of classes is fully known here.
+ if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0)
+ {
+ if (mParam.numOutputBoxesPerClass * mParam.numClasses < mParam.numOutputBoxes)
+ {
+ mParam.numOutputBoxes = mParam.numOutputBoxesPerClass * mParam.numClasses;
+ }
+ }
+
+ // Shape of boxes input should be
+ // [batch_size, num_boxes, 4] or [batch_size, num_boxes, 1, 4] or [batch_size, num_boxes, num_classes, 4]
+ PLUGIN_ASSERT(in[0].desc.dims.nbDims == 3 || in[0].desc.dims.nbDims == 4);
+ if (in[0].desc.dims.nbDims == 3)
+ {
+ PLUGIN_ASSERT(in[0].desc.dims.d[2] == 4);
+ mParam.shareLocation = true;
+ mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2];
+ }
+ else
+ {
+ mParam.shareLocation = (in[0].desc.dims.d[2] == 1);
+ PLUGIN_ASSERT(in[0].desc.dims.d[2] == mParam.numClasses || mParam.shareLocation);
+ PLUGIN_ASSERT(in[0].desc.dims.d[3] == 4);
+ mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2] * in[0].desc.dims.d[3];
+ }
+ mParam.numAnchors = in[0].desc.dims.d[1];
+
+ if (nbInputs == 2)
+ {
+ // Only two inputs are used, disable the fused box decoder
+ mParam.boxDecoder = false;
+ }
+ if (nbInputs == 3)
+ {
+ // All three inputs are used, enable the box decoder
+ // Shape of anchors input should be
+ // Constant shape: [1, numAnchors, 4] or [batch_size, numAnchors, 4]
+ PLUGIN_ASSERT(in[2].desc.dims.nbDims == 3);
+ mParam.boxDecoder = true;
+ mParam.shareAnchors = (in[2].desc.dims.d[0] == 1);
+ }
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+}
+
+size_t EfficientNMSCustomPlugin::getWorkspaceSize(
+ const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const noexcept
+{
+ int batchSize = inputs[1].dims.d[0];
+ int numScoreElements = inputs[1].dims.d[1] * inputs[1].dims.d[2];
+ int numClasses = inputs[1].dims.d[2];
+ return EfficientNMSCustomWorkspaceSize(batchSize, numScoreElements, numClasses, mParam.datatype);
+}
+
+int EfficientNMSCustomPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
+ const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept
+{
+ try
+ {
+ mParam.batchSize = inputDesc[0].dims.d[0];
+
+ // Standard NMS Operation
+ const void* const boxesInput = inputs[0];
+ const void* const scoresInput = inputs[1];
+ const void* const anchorsInput = mParam.boxDecoder ? inputs[2] : nullptr;
+
+ void* numDetectionsOutput = outputs[0];
+ void* nmsBoxesOutput = outputs[1];
+ void* nmsScoresOutput = outputs[2];
+ void* nmsClassesOutput = outputs[3];
+ void* nmsIndicesOutput = outputs[4];
+
+ return EfficientNMSCustomInference(mParam, boxesInput, scoresInput, anchorsInput, numDetectionsOutput, nmsBoxesOutput,
+ nmsScoresOutput, nmsClassesOutput, nmsIndicesOutput, workspace, stream);
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return -1;
+}
+
+// Standard NMS Plugin Operation
+
+EfficientNMSCustomPluginCreator::EfficientNMSCustomPluginCreator()
+ : mParam{}
+{
+ mPluginAttributes.clear();
+ mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+ mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+ mPluginAttributes.emplace_back(PluginField("max_output_boxes", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("background_class", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("score_activation", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("box_coding", nullptr, PluginFieldType::kINT32, 1));
+ mFC.nbFields = mPluginAttributes.size();
+ mFC.fields = mPluginAttributes.data();
+}
+
+const char* EfficientNMSCustomPluginCreator::getPluginName() const noexcept
+{
+ return EFFICIENT_NMS_CUSTOM_PLUGIN_NAME;
+}
+
+const char* EfficientNMSCustomPluginCreator::getPluginVersion() const noexcept
+{
+ return EFFICIENT_NMS_CUSTOM_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection* EfficientNMSCustomPluginCreator::getFieldNames() noexcept
+{
+ return &mFC;
+}
+
+IPluginV2DynamicExt* EfficientNMSCustomPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) noexcept
+{
+ try
+ {
+ const PluginField* fields = fc->fields;
+ for (int i = 0; i < fc->nbFields; ++i)
+ {
+ const char* attrName = fields[i].name;
+ if (!strcmp(attrName, "score_threshold"))
+ {
+ PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+ mParam.scoreThreshold = *(static_cast<const float*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "iou_threshold"))
+ {
+ PLUGIN_ASSERT(fields[i].type == PluginFieldType::kFLOAT32);
+ mParam.iouThreshold = *(static_cast<const float*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "max_output_boxes"))
+ {
+ PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32);
+ mParam.numOutputBoxes = *(static_cast<const int32_t*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "background_class"))
+ {
+ PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32);
+ mParam.backgroundClass = *(static_cast<const int32_t*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "score_activation"))
+ {
+ auto scoreSigmoid = *(static_cast<const int32_t*>(fields[i].data));
+ PLUGIN_VALIDATE(scoreSigmoid == 0 || scoreSigmoid == 1);
+ mParam.scoreSigmoid = static_cast<bool>(scoreSigmoid);
+ }
+ if (!strcmp(attrName, "box_coding"))
+ {
+ PLUGIN_ASSERT(fields[i].type == PluginFieldType::kINT32);
+ mParam.boxCoding = *(static_cast<const int32_t*>(fields[i].data));
+ }
+ }
+
+ auto* plugin = new EfficientNMSCustomPlugin(mParam);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
+
+IPluginV2DynamicExt* EfficientNMSCustomPluginCreator::deserializePlugin(
+ const char* name, const void* serialData, size_t serialLength) noexcept
+{
+ try
+ {
+ // This object will be deleted when the network is destroyed, which will
+ // call EfficientNMSCustomPlugin::destroy()
+ auto* plugin = new EfficientNMSCustomPlugin(serialData, serialLength);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
diff --git a/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.h b/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.h
new file mode 100644
index 00000000..2313d9d5
--- /dev/null
+++ b/plugin/efficientNMSCustomPlugin/efficientNMSCustomPlugin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TRT_EFFICIENT_NMS_CUSTOM_PLUGIN_H
+#define TRT_EFFICIENT_NMS_CUSTOM_PLUGIN_H
+
+#include <vector>
+
+#include "common/plugin.h"
+#include "efficientNMSCustomParameters.h"
+
+using namespace nvinfer1::plugin;
+namespace nvinfer1
+{
+namespace plugin
+{
+
+class EfficientNMSCustomPlugin : public IPluginV2DynamicExt
+{
+public:
+ explicit EfficientNMSCustomPlugin(EfficientNMSCustomParameters param);
+ EfficientNMSCustomPlugin(const void* data, size_t length);
+ ~EfficientNMSCustomPlugin() override = default;
+
+ // IPluginV2 methods
+ const char* getPluginType() const noexcept override;
+ const char* getPluginVersion() const noexcept override;
+ int getNbOutputs() const noexcept override;
+ int initialize() noexcept override;
+ void terminate() noexcept override;
+ size_t getSerializationSize() const noexcept override;
+ void serialize(void* buffer) const noexcept override;
+ void destroy() noexcept override;
+ void setPluginNamespace(const char* libNamespace) noexcept override;
+ const char* getPluginNamespace() const noexcept override;
+
+ // IPluginV2Ext methods
+ nvinfer1::DataType getOutputDataType(
+ int index, const nvinfer1::DataType* inputType, int nbInputs) const noexcept override;
+
+ // IPluginV2DynamicExt methods
+ IPluginV2DynamicExt* clone() const noexcept override;
+ DimsExprs getOutputDimensions(
+ int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) noexcept override;
+ bool supportsFormatCombination(
+ int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept override;
+ void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out,
+ int nbOutputs) noexcept override;
+ size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs,
+ int nbOutputs) const noexcept override;
+ int enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs,
+ void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
+
+protected:
+ EfficientNMSCustomParameters mParam{};
+ std::string mNamespace;
+};
+
+// Standard NMS Plugin Operation
+class EfficientNMSCustomPluginCreator : public nvinfer1::pluginInternal::BaseCreator
+{
+public:
+ EfficientNMSCustomPluginCreator();
+ ~EfficientNMSCustomPluginCreator() override = default;
+
+ const char* getPluginName() const noexcept override;
+ const char* getPluginVersion() const noexcept override;
+ const PluginFieldCollection* getFieldNames() noexcept override;
+
+ IPluginV2DynamicExt* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
+ IPluginV2DynamicExt* deserializePlugin(
+ const char* name, const void* serialData, size_t serialLength) noexcept override;
+
+protected:
+ PluginFieldCollection mFC;
+ EfficientNMSCustomParameters mParam;
+ std::vector<PluginField> mPluginAttributes;
+ std::string mPluginName;
+};
+
+} // namespace plugin
+} // namespace nvinfer1
+
+#endif // TRT_EFFICIENT_NMS_CUSTOM_PLUGIN_H
diff --git a/plugin/efficientNMSLandmarkPlugin/CMakeLists.txt b/plugin/efficientNMSLandmarkPlugin/CMakeLists.txt
new file mode 100644
index 00000000..53b70a7e
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/CMakeLists.txt
@@ -0,0 +1,21 @@
+#
+# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
+file(GLOB CU_SRCS *.cu)
+set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} ${CU_SRCS})
+set(PLUGIN_CU_SOURCES ${PLUGIN_CU_SOURCES} PARENT_SCOPE)
diff --git a/plugin/efficientNMSLandmarkPlugin/README.md b/plugin/efficientNMSLandmarkPlugin/README.md
new file mode 100644
index 00000000..ee5657c8
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/README.md
@@ -0,0 +1,159 @@
+# Efficient NMS Landmark Plugin
+
+#### Table of Contents
+- [Description](#description)
+- [Structure](#structure)
+ * [Inputs](#inputs)
+ * [Dynamic Shape Support](#dynamic-shape-support)
+ * [Box Coding Type](#box-coding-type)
+ * [Outputs](#outputs)
+ * [Parameters](#parameters)
+- [Algorithm](#algorithm)
+ * [Process Description](#process-description)
+ * [Performance Tuning](#performance-tuning)
+ * [Additional Resources](#additional-resources)
+- [License](#license)
+
+## Description
+
+This TensorRT plugin implements an efficient algorithm to perform Non Maximum Suppression for object detection networks.
+
+This plugin is primarily intended for use with EfficientDet on TensorRT, as this network is particularly sensitive to the latencies introduced by slower NMS implementations. However, the plugin is generic enough to work correctly with other detection architectures, such as SSD or FasterRCNN.
+
+## Structure
+
+### Inputs
+
+The plugin has two modes of operation, depending on the given input data. The plugin automatically detects which mode to operate in, based on the number of inputs it receives, as follows:
+
+1. **Standard NMS Mode:** Only two input tensors are given, (i) the bounding box coordinates and (ii) the corresponding classification scores for each box.
+
+2. **Fused Box Decoder Mode:** Three input tensors are given, (i) the raw localization predictions for each box originating directly from the localization head of the network, (ii) the corresponding classification scores originating from the classification head of the network, and (iii) the default anchor box coordinates usually hardcoded as constant tensors in the network.
+
+Most object detection networks work by generating raw predictions from a "localization head" which adjust the coordinates of standard non-learned anchor coordinates to produce a tighter fitting bounding box. This process is called "box decoding", and it usually involves a large number of element-wise operations to transform the anchors to final box coordinates. As this can involve exponential operations on a large number of anchors, it can be computationally expensive, so this plugin gives the option of fusing the box decoder within the NMS operation which can be done in a far more efficient manner, resulting in lower latency for the network.
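+
+For reference, the decode arithmetic for the `BoxCenterSize` coding amounts to the following (a minimal sketch mirroring the `decode` device function in the inference header; the struct and function names here are purely illustrative):
+
+```cpp
+#include <cmath>
+
+// One box in [y, x, h, w] (BoxCenterSize) coding.
+struct Box { float y, x, h, w; };
+
+// Decode a raw localization prediction (deltas) against its anchor box.
+Box decodeCenterSize(Box delta, Box anchor)
+{
+    Box out;
+    out.y = delta.y * anchor.h + anchor.y; // shift the center by a fraction of the anchor size
+    out.x = delta.x * anchor.w + anchor.x;
+    out.h = anchor.h * std::exp(delta.h);  // scale the anchor dimensions exponentially
+    out.w = anchor.w * std::exp(delta.w);
+    return out;
+}
+```
+
+Fusing this per-anchor arithmetic into the NMS kernel avoids materializing the decoded boxes as a separate tensor in the network.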
+
+#### Boxes Input
+> **Input Shape:** `[batch_size, number_boxes, 4]` or `[batch_size, number_boxes, number_classes, 4]`
+>
+> **Data Type:** `float32` or `float16`
+
+The boxes input can have 3 dimensions in case a single box prediction is produced for all classes (such as in EfficientDet or SSD), or 4 dimensions when separate box predictions are generated for each class (such as in FasterRCNN), in which case `number_classes` >= 1 and must match the number of classes in the scores input. The final dimension represents the four coordinates that define the bounding box prediction.
+
+For *Standard NMS* mode, this tensor should contain the final box coordinates for each predicted detection. For *Fused Box Decoder* mode, this tensor should have the raw localization predictions. In either case, this data is given as `4` coordinates which makes up the final shape dimension.
+
+#### Scores Input
+> **Input Shape:** `[batch_size, number_boxes, number_classes]`
+>
+> **Data Type:** `float32` or `float16`
+
+The scores input has `number_classes` elements with the predicted scores for each candidate class for each of the `number_boxes` anchor boxes.
+
+Usually, the score values will have passed through a sigmoid activation function before reaching the NMS operation. However, as an optimization, the pre-sigmoid raw scores can also be provided to the NMS plugin to reduce overall network latency. If raw scores are given, enable the `score_activation` parameter so they are processed accordingly.
+
+#### Anchors Input (Optional)
+> **Input Shape:** `[1, number_boxes, 4]` or `[batch_size, number_boxes, 4]`
+>
+> **Data Type:** `float32` or `float16`
+
+Only used in *Fused Box Decoder* mode. It is much more efficient to perform the box decoding within this plugin. In this case, the boxes input will be treated as the raw localization head box corrections, and this third input should contain the default anchor/prior box coordinates.
+
+When used, the input must have 3 dimensions, where the first one may be either `1` in case anchors are constant for all images in a batch, or `batch_size` in case each image has different anchors -- such as in the box refinement NMS of FasterRCNN's second stage.
+
+### Dynamic Shape Support
+
+Most input shape dimensions, namely `batch_size`, `number_boxes`, and `number_classes`, for all inputs can be defined dynamically at runtime if the TensorRT engine is built with dynamic input shapes. However, once defined, these dimensions must match across all tensors that use them (e.g. the same `number_boxes` dimension must be given for both boxes and scores, etc.)
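+
+As an illustration, a dynamic-shape engine would register an optimization profile for these inputs roughly as follows (a sketch only; the tensor names `boxes` and `scores`, the box count, and the class count of 80 are assumptions that depend on the exported network):
+
+```cpp
+#include <NvInfer.h>
+
+// Register min/opt/max shapes for the NMS inputs on an existing builder/config pair.
+void addNmsProfile(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config)
+{
+    using namespace nvinfer1;
+    IOptimizationProfile* profile = builder->createOptimizationProfile();
+    // batch_size is dynamic here; number_boxes and number_classes are fixed for illustration.
+    profile->setDimensions("boxes", OptProfileSelector::kMIN, Dims3{1, 1000, 4});
+    profile->setDimensions("boxes", OptProfileSelector::kOPT, Dims3{8, 1000, 4});
+    profile->setDimensions("boxes", OptProfileSelector::kMAX, Dims3{16, 1000, 4});
+    profile->setDimensions("scores", OptProfileSelector::kMIN, Dims3{1, 1000, 80});
+    profile->setDimensions("scores", OptProfileSelector::kOPT, Dims3{8, 1000, 80});
+    profile->setDimensions("scores", OptProfileSelector::kMAX, Dims3{16, 1000, 80});
+    config->addOptimizationProfile(profile);
+}
+```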
+
+### Box Coding Type
+Different object detection networks represent their box coordinate system differently. The two types supported by this plugin are:
+
+1. **BoxCorners:** The four coordinates represent `[x1, y1, x2, y2]` values, where each x,y pair defines the top-left and bottom-right corners of a bounding box.
+2. **BoxCenterSize:** The four coordinates represent `[x, y, w, h]` values, where the x,y pair define the box center location, and the w,h pair define its width and height.
+
+Note that for NMS purposes, horizontal and vertical coordinates are fully interchangeable. TensorFlow-trained networks, for example, often use vertical-first coordinates such as `[y1, x1, y2, x2]`, but this coordinate system works equally well under the BoxCorner coding. Similarly, `[y, x, h, w]` is properly covered by the BoxCenterSize coding.
+
+In *Fused Box Decoder* mode, the boxes and anchor tensors should both use the same coding.
+
+### Outputs
+
+The following four output tensors are generated:
+
+- **num_detections:**
+ This is a `[batch_size, 1]` tensor of data type `int32`. The last dimension is a scalar indicating the number of valid detections per batch image. It can be less than `max_output_boxes`. Only the top `num_detections[i]` entries in `nms_boxes[i]`, `nms_scores[i]` and `nms_classes[i]` are valid.
+
+- **detection_boxes:**
+ This is a `[batch_size, max_output_boxes, 4]` tensor of data type `float32` or `float16`, containing the coordinates of non-max suppressed boxes. The output coordinates will always be in BoxCorner format, regardless of the input code type.
+
+- **detection_scores:**
+ This is a `[batch_size, max_output_boxes]` tensor of data type `float32` or `float16`, containing the scores for the boxes.
+
+- **detection_classes:**
+ This is a `[batch_size, max_output_boxes]` tensor of data type `int32`, containing the classes for the boxes.
+
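+A minimal host-side sketch of how these outputs fit together once copied back from the device (buffer names are placeholders; only the first `num_detections[i]` entries per image are valid):
+
+```cpp
+#include <cstdio>
+
+void printDetections(const int* numDetections, const float* boxes, const float* scores,
+    const int* classes, int batchSize, int maxOutputBoxes)
+{
+    for (int b = 0; b < batchSize; ++b)
+    {
+        for (int i = 0; i < numDetections[b]; ++i) // entries beyond num_detections[b] are padding
+        {
+            const float* box = &boxes[(b * maxOutputBoxes + i) * 4]; // always BoxCorner order
+            std::printf("image %d: class %d, score %.3f, box [%.1f, %.1f, %.1f, %.1f]\n", b,
+                classes[b * maxOutputBoxes + i], scores[b * maxOutputBoxes + i], box[0], box[1], box[2], box[3]);
+        }
+    }
+}
+```
+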
+### Parameters
+
+| Type | Parameter | Description
+|----------|--------------------------|--------------------------------------------------------
+|`float` |`score_threshold` * |The scalar threshold for score (low scoring boxes are removed).
+|`float` |`iou_threshold` |The scalar threshold for IOU (additional boxes that have high IOU overlap with previously selected boxes are removed).
+|`int` |`max_output_boxes` |The maximum number of detections to output per image.
+|`int` |`background_class` |The label ID for the background class. If there is no background class, set it to `-1`.
+|`bool` |`score_activation` * |Set to true to apply sigmoid activation to the confidence scores during NMS operation.
+|`int` |`box_coding` |Coding type used for boxes (and anchors if applicable), 0 = BoxCorner, 1 = BoxCenterSize.
+
+Parameters marked with a `*` have a non-negligible effect on runtime latency. See the [Performance Tuning](#performance-tuning) section below for more details on how to set them optimally.
+
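+As an illustration, the plugin can also be created programmatically through the plugin registry using exactly these field names. This is a sketch only; when importing an ONNX graph the parser normally does this for you, and it assumes the plugin library has already been loaded and registered:
+
+```cpp
+#include <vector>
+#include "NvInferRuntime.h"
+
+nvinfer1::IPluginV2* createLandmarkNms()
+{
+    float scoreThreshold = 0.25f;
+    float iouThreshold = 0.5f;
+    int maxOutputBoxes = 100;
+    int backgroundClass = -1;
+    int scoreActivation = 0; // scores are already sigmoid-activated
+    int boxCoding = 0;       // 0 = BoxCorner
+
+    std::vector<nvinfer1::PluginField> fields{
+        {"score_threshold", &scoreThreshold, nvinfer1::PluginFieldType::kFLOAT32, 1},
+        {"iou_threshold", &iouThreshold, nvinfer1::PluginFieldType::kFLOAT32, 1},
+        {"max_output_boxes", &maxOutputBoxes, nvinfer1::PluginFieldType::kINT32, 1},
+        {"background_class", &backgroundClass, nvinfer1::PluginFieldType::kINT32, 1},
+        {"score_activation", &scoreActivation, nvinfer1::PluginFieldType::kINT32, 1},
+        {"box_coding", &boxCoding, nvinfer1::PluginFieldType::kINT32, 1}};
+    nvinfer1::PluginFieldCollection fc{static_cast<int>(fields.size()), fields.data()};
+
+    auto* creator = getPluginRegistry()->getPluginCreator("EfficientNMSLandmark_TRT", "1");
+    return creator != nullptr ? creator->createPlugin("nms_landmark", &fc) : nullptr;
+}
+```
+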
+## Algorithm
+
+### Process Description
+
+The NMS algorithm in this plugin first filters out the scores below the given `score_threshold`. This subset of scores is then sorted, and their corresponding boxes are further filtered by removing boxes that overlap previously selected boxes with an IOU above the given `iou_threshold`.
+
+The algorithm launcher and its relevant CUDA kernels are all defined in the `efficientNMSLandmarkInference.cu` file.
+
+Specifically, the NMS algorithm does the following:
+
+- The scores are filtered with the `score_threshold` parameter to reject any scores below the score threshold, while maintaining indexing to cross-reference these scores to their corresponding box coordinates. This is done with the `EfficientNMSLandmarkFilter` CUDA kernel.
+
+- If too many elements are kept, due to a very low (or zero) score threshold, the filter operation can become a bottleneck due to the atomic operations involved. To mitigate this, a fallback kernel `EfficientNMSLandmarkDenseIndex` is used instead which passes all the score elements densely packed and indexed. This method is heuristically selected only if the score threshold is less than 0.007.
+
+- The selected scores that remain after filtering are sorted in descending order. The indexing is carefully handled to still maintain score to box relationships after sorting.
+
+- After sorting, the highest 4096 scores are processed by the `EfficientNMSLandmark` CUDA kernel. This algorithm uses the index data maintained throughout the previous steps to find the boxes corresponding to the remaining scores. If the fused box decoder is being used, decoding is deferred to this stage, so only the top-scoring boxes need to be decoded.
+
+- The NMS kernel uses an efficient filtering algorithm that largely reduces the number of IOU overlap cross-checks between box pairs. The boxes that survive the IOU filtering finally pass through to the output results. At this stage, if `score_activation` is enabled, the sigmoid activation is applied only to the final remaining scores, greatly reducing the number of sigmoid calculations that would otherwise be required.
+
+### Performance Tuning
+
+The plugin implements a very efficient NMS algorithm which largely reduces the latency of this operation in comparison to other NMS plugins. However, a few considerations can help fine-tune its performance further:
+
+#### Choosing the Score Threshold
+
+The algorithm is highly sensitive to the selected `score_threshold` parameter. With a higher threshold, fewer elements need to be processed and so the algorithm runs much faster. Therefore, it's beneficial to always select the highest possible score threshold that fulfills the application requirements. Threshold values lower than approximately 0.01 may cause substantially higher latency.
+
+#### Using Sigmoid Activation
+
+Depending on network configuration, it is usually more efficient to provide raw scores (pre-sigmoid) to the NMS plugin scores input, and enable the `score_activation` parameter. Doing so applies a sigmoid activation only to the last `max_output_boxes` selected scores, instead of all the predicted scores, largely reducing the computational cost.
+
+#### Using the Fused Box Decoder
+
+When using networks with many anchors, such as EfficientDet or SSD, it may be more efficient to do box decoding within the NMS plugin. For this, pass the raw box predictions as the boxes input, and the default anchor coordinates as the optional third input to the plugin.
+
+### Additional Resources
+
+The following resources provide a deeper understanding of the NMS algorithm:
+
+#### Networks
+- [EfficientDet](https://arxiv.org/abs/1911.09070)
+- [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325)
+- [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks](https://arxiv.org/abs/1506.01497)
+- [Mask R-CNN](https://arxiv.org/abs/1703.06870)
+
+
+#### Documentation
+- [NMS algorithm](https://www.coursera.org/lecture/convolutional-neural-networks/non-max-suppression-dvrjH)
+- [NonMaxSuppression ONNX Op](https://github.com/onnx/onnx/blob/master/docs/Operators.md#NonMaxSuppression)
+
+## License
+
+For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html)
+documentation.
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cu b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cu
new file mode 100644
index 00000000..33628734
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cu
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common/bboxUtils.h"
+#include "cub/cub.cuh"
+#include "cuda_runtime_api.h"
+
+#include "efficientNMSLandmarkInference.cuh"
+#include "efficientNMSLandmarkInference.h"
+
+#define NMS_TILES 5
+
+using namespace nvinfer1;
+
+template <typename T>
+__device__ float IOU(EfficientNMSLandmarkParameters param, BoxCorner<T> box1, BoxCorner<T> box2)
+{
+    // Regardless of the selected box coding, IOU is always performed in BoxCorner coding.
+    // The boxes are copied so that they can be reordered without affecting the originals.
+    BoxCorner<T> b1 = box1;
+    BoxCorner<T> b2 = box2;
+ b1.reorder();
+ b2.reorder();
+ float intersectArea = BoxCorner::intersect(b1, b2).area();
+ if (intersectArea <= 0.f)
+ {
+ return 0.f;
+ }
+ float unionArea = b1.area() + b2.area() - intersectArea;
+ if (unionArea <= 0.f)
+ {
+ return 0.f;
+ }
+ return intersectArea / unionArea;
+}
+
+template <typename T, typename Tb>
+__device__ BoxCorner<T> DecodeBoxesLandmark(EfficientNMSLandmarkParameters param, int boxIdx, int anchorIdx,
+ const Tb* __restrict__ boxesInput, const Tb* __restrict__ anchorsInput)
+{
+ // The inputs will be in the selected coding format, as well as the decoding function. But the decoded box
+ // will always be returned as BoxCorner.
+ Tb box = boxesInput[boxIdx];
+ if (!param.boxDecoder)
+ {
+        return BoxCorner<T>(box);
+ }
+ Tb anchor = anchorsInput[anchorIdx];
+ box.reorder();
+ anchor.reorder();
+    return BoxCorner<T>(box.decode(anchor));
+}
+
+template <typename T, typename Tb>
+__device__ void MapNMSLandmarkData(EfficientNMSLandmarkParameters param, int idx, int imageIdx,
+ const Tb* __restrict__ boxesInput, const T* __restrict__ landmarksInput, const Tb* __restrict__ anchorsInput,
+ const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, const int* __restrict__ topNumData,
+ const T* __restrict__ sortedScoresData, const int* __restrict__ sortedIndexData, T& scoreMap, int& classMap,
+    BoxCorner<T>& boxMap, Landmark<T>& landmarkMap, int& boxIdxMap)
+{
+ // idx: Holds the NMS box index, within the current batch.
+ // idxSort: Holds the batched NMS box index, which indexes the (filtered, but sorted) score buffer.
+ // scoreMap: Holds the score that corresponds to the indexed box being processed by NMS.
+ if (idx >= topNumData[imageIdx])
+ {
+ return;
+ }
+ int idxSort = imageIdx * param.numScoreElements + idx;
+ scoreMap = sortedScoresData[idxSort];
+
+ // idxMap: Holds the re-mapped index, which indexes the (filtered, but unsorted) buffers.
+ // classMap: Holds the class that corresponds to the idx'th sorted score being processed by NMS.
+ // anchorMap: Holds the anchor that corresponds to the idx'th sorted score being processed by NMS.
+ int idxMap = imageIdx * param.numScoreElements + sortedIndexData[idxSort];
+ classMap = topClassData[idxMap];
+ int anchorMap = topAnchorsData[idxMap];
+
+ // boxIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) boxes input buffer.
+ boxIdxMap = -1;
+ if (param.shareLocation) // Shape of boxesInput: [batchSize, numAnchors, 1, 4]
+ {
+ boxIdxMap = imageIdx * param.numAnchors + anchorMap;
+ }
+ else // Shape of boxesInput: [batchSize, numAnchors, numClasses, 4]
+ {
+ int batchOffset = imageIdx * param.numAnchors * param.numClasses;
+ int anchorOffset = anchorMap * param.numClasses;
+ boxIdxMap = batchOffset + anchorOffset + classMap;
+ }
+ // anchorIdxMap: Holds the re-re-mapped index, which indexes the (unfiltered, and unsorted) anchors input buffer.
+ int anchorIdxMap = -1;
+ if (param.shareAnchors) // Shape of anchorsInput: [1, numAnchors, 4]
+ {
+ anchorIdxMap = anchorMap;
+ }
+ else // Shape of anchorsInput: [batchSize, numAnchors, 4]
+ {
+ anchorIdxMap = imageIdx * param.numAnchors + anchorMap;
+ }
+ // boxMap: Holds the box that corresponds to the idx'th sorted score being processed by NMS.
+    boxMap = DecodeBoxesLandmark<T, Tb>(param, boxIdxMap, anchorIdxMap, boxesInput, anchorsInput);
+
+ for (size_t idx_lmk = 0; idx_lmk < 10; idx_lmk++)
+ landmarkMap.lmk[idx_lmk] = landmarksInput[boxIdxMap * 10 + idx_lmk];
+}
+
+template <typename T>
+__device__ void WriteNMSLandmarkResult(EfficientNMSLandmarkParameters param, int* __restrict__ numDetectionsOutput,
+    T* __restrict__ nmsScoresOutput, int* __restrict__ nmsClassesOutput, BoxCorner<T>* __restrict__ nmsBoxesOutput,
+    T* __restrict__ nmsLandmarksOutput, T threadScore, int threadClass, BoxCorner<T> threadBox,
+    Landmark<T> threadLandmark, int imageIdx, unsigned int resultsCounter)
+{
+ int outputIdx = imageIdx * param.numOutputBoxes + resultsCounter - 1;
+ if (param.scoreSigmoid)
+ {
+ nmsScoresOutput[outputIdx] = sigmoid_mp(threadScore);
+ }
+ else if (param.scoreBits > 0)
+ {
+ nmsScoresOutput[outputIdx] = add_mp(threadScore, (T) -1);
+ }
+ else
+ {
+ nmsScoresOutput[outputIdx] = threadScore;
+ }
+ nmsClassesOutput[outputIdx] = threadClass;
+ if (param.clipBoxes)
+ {
+ nmsBoxesOutput[outputIdx] = threadBox.clip((T) 0, (T) 1);
+ }
+ else
+ {
+ nmsBoxesOutput[outputIdx] = threadBox;
+ }
+
+ for (size_t idx_lmk = 0; idx_lmk < 10; idx_lmk++)
+ nmsLandmarksOutput[outputIdx * 10 + idx_lmk] = threadLandmark.lmk[idx_lmk];
+
+ numDetectionsOutput[imageIdx] = resultsCounter;
+}
+
+template <typename T, typename Tb>
+__global__ void EfficientNMSLandmark(EfficientNMSLandmarkParameters param, const int* topNumData, int* outputIndexData,
+ int* outputClassData, const int* sortedIndexData, const T* __restrict__ sortedScoresData,
+ const int* __restrict__ topClassData, const int* __restrict__ topAnchorsData, const Tb* __restrict__ boxesInput,
+ const T* __restrict__ landmarksInput, const Tb* __restrict__ anchorsInput, int* __restrict__ numDetectionsOutput,
+    T* __restrict__ nmsScoresOutput, int* __restrict__ nmsClassesOutput, BoxCorner<T>* __restrict__ nmsBoxesOutput,
+ T* __restrict__ nmsLandmarksOutput)
+{
+ unsigned int thread = threadIdx.x;
+ unsigned int imageIdx = blockIdx.y;
+ unsigned int tileSize = blockDim.x;
+ if (imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ int numSelectedBoxes = min(topNumData[imageIdx], param.numSelectedBoxes);
+ int numTiles = (numSelectedBoxes + tileSize - 1) / tileSize;
+ if (thread >= numSelectedBoxes)
+ {
+ return;
+ }
+
+ __shared__ int blockState;
+ __shared__ unsigned int resultsCounter;
+ if (thread == 0)
+ {
+ blockState = 0;
+ resultsCounter = 0;
+ }
+
+ int threadState[NMS_TILES];
+ unsigned int boxIdx[NMS_TILES];
+ T threadScore[NMS_TILES];
+ int threadClass[NMS_TILES];
+    BoxCorner<T> threadBox[NMS_TILES];
+    Landmark<T> threadLandmark[NMS_TILES];
+ int boxIdxMap[NMS_TILES];
+ for (int tile = 0; tile < numTiles; tile++)
+ {
+ threadState[tile] = 0;
+ boxIdx[tile] = thread + tile * blockDim.x;
+ MapNMSLandmarkData(param, boxIdx[tile], imageIdx, boxesInput, landmarksInput, anchorsInput, topClassData,
+ topAnchorsData, topNumData, sortedScoresData, sortedIndexData, threadScore[tile], threadClass[tile],
+ threadBox[tile], threadLandmark[tile], boxIdxMap[tile]);
+ }
+
+ // Iterate through all boxes to NMS against.
+ for (int i = 0; i < numSelectedBoxes; i++)
+ {
+ int tile = i / tileSize;
+
+ if (boxIdx[tile] == i)
+ {
+ // Iteration lead thread, figure out what the other threads should do,
+ // this will be signaled via the blockState shared variable.
+ if (threadState[tile] == -1)
+ {
+ // Thread already dead, this box was already dropped in a previous iteration,
+ // because it had a large IOU overlap with another lead thread previously, so
+                // it would never be kept anyway, therefore all IOU operations in this
+                // iteration can safely be skipped.
+ blockState = -1; // -1 => Signal all threads to skip iteration
+ }
+ else if (threadState[tile] == 0)
+ {
+ // As this box will be kept, this is a good place to find what index in the results buffer it
+ // should have, as this allows to perform an early loop exit if there are enough results.
+ if (resultsCounter >= param.numOutputBoxes)
+ {
+ blockState = -2; // -2 => Signal all threads to do an early loop exit.
+ }
+ else
+ {
+ // Thread is still alive, because it has not had a large enough IOU overlap with
+ // any other kept box previously. Therefore, this box will be kept for sure. However,
+ // we need to check against all other subsequent boxes from this position onward,
+ // to see how those other boxes will behave in future iterations.
+ blockState = 1; // +1 => Signal all (higher index) threads to calculate IOU against this box
+ threadState[tile] = 1; // +1 => Mark this box's thread to be kept and written out to results
+
+ // If the numOutputBoxesPerClass check is enabled, write the result only if the limit for this
+ // class on this image has not been reached yet. Other than (possibly) skipping the write, this
+ // won't affect anything else in the NMS threading.
+ bool write = true;
+ if (param.numOutputBoxesPerClass >= 0)
+ {
+ int classCounterIdx = imageIdx * param.numClasses + threadClass[tile];
+ write = (outputClassData[classCounterIdx] < param.numOutputBoxesPerClass);
+ outputClassData[classCounterIdx]++;
+ }
+ if (write)
+ {
+ // This branch is visited by one thread per iteration, so it's safe to do non-atomic increments.
+ resultsCounter++;
+ WriteNMSLandmarkResult(param, numDetectionsOutput, nmsScoresOutput, nmsClassesOutput,
+ nmsBoxesOutput, nmsLandmarksOutput, threadScore[tile], threadClass[tile], threadBox[tile],
+ threadLandmark[tile], imageIdx, resultsCounter);
+ }
+ }
+ }
+ else
+ {
+ // This state should never be reached, but just in case...
+ blockState = 0; // 0 => Signal all threads to not do any updates, nothing happens.
+ }
+ }
+
+ __syncthreads();
+
+ if (blockState == -2)
+ {
+ // This is the signal to exit from the loop.
+ return;
+ }
+
+ if (blockState == -1)
+ {
+ // This is the signal for all threads to just skip this iteration, as no IOU's need to be checked.
+ continue;
+ }
+
+ // Grab a box and class to test the current box against. The test box corresponds to iteration i,
+ // therefore it will have a lower index than the current thread box, and will therefore have a higher score
+ // than the current box because it's located "before" in the sorted score list.
+ T testScore;
+ int testClass;
+        BoxCorner<T> testBox;
+        Landmark<T> testLandmark;
+ int testBoxIdxMap;
+ MapNMSLandmarkData(param, i, imageIdx, boxesInput, landmarksInput, anchorsInput, topClassData,
+ topAnchorsData, topNumData, sortedScoresData, sortedIndexData, testScore, testClass, testBox, testLandmark,
+ testBoxIdxMap);
+
+ for (int tile = 0; tile < numTiles; tile++)
+ {
+ // IOU
+ if (boxIdx[tile] > i && // Make sure two different boxes are being tested, and that it's a higher index;
+ boxIdx[tile] < numSelectedBoxes && // Make sure the box is within numSelectedBoxes;
+ blockState == 1 && // Signal that allows IOU checks to be performed;
+ threadState[tile] == 0 && // Make sure this box hasn't been either dropped or kept already;
+ threadClass[tile] == testClass && // Compare only boxes of matching classes;
+ lte_mp(threadScore[tile], testScore) && // Make sure the sorting order of scores is as expected;
+ IOU(param, threadBox[tile], testBox) >= param.iouThreshold) // And... IOU overlap.
+ {
+ // Current box overlaps with the box tested in this iteration, this box will be skipped.
+ threadState[tile] = -1; // -1 => Mark this box's thread to be dropped.
+ }
+ }
+ }
+}
+
+template <typename T>
+cudaError_t EfficientNMSLandmarkLauncher(EfficientNMSLandmarkParameters& param, int* topNumData, int* outputIndexData,
+ int* outputClassData, int* sortedIndexData, T* sortedScoresData, int* topClassData, int* topAnchorsData,
+ const void* boxesInput, const void* landmarksInput, const void* anchorsInput, int* numDetectionsOutput,
+ T* nmsScoresOutput, int* nmsClassesOutput, void* nmsBoxesOutput, void* nmsLandmarksOutput, cudaStream_t stream)
+{
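+    // Each thread of the NMS block owns up to NMS_TILES candidate boxes, so the tile (block) size is
+    // derived from numSelectedBoxes / NMS_TILES, with a fixed 512- or 256-thread block for small candidate counts.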
+ unsigned int tileSize = param.numSelectedBoxes / NMS_TILES;
+ if (param.numSelectedBoxes <= 512)
+ {
+ tileSize = 512;
+ }
+ if (param.numSelectedBoxes <= 256)
+ {
+ tileSize = 256;
+ }
+
+ const dim3 blockSize = {tileSize, 1, 1};
+ const dim3 gridSize = {1, (unsigned int) param.batchSize, 1};
+
+ if (param.boxCoding == 0)
+ {
+        EfficientNMSLandmark<T, BoxCorner<T>><<<gridSize, blockSize, 0, stream>>>(param, topNumData, outputIndexData,
+            outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData,
+            (BoxCorner<T>*) boxesInput, (T*) landmarksInput, (BoxCorner<T>*) anchorsInput, numDetectionsOutput,
+            nmsScoresOutput, nmsClassesOutput, (BoxCorner<T>*) nmsBoxesOutput, (T*) nmsLandmarksOutput);
+ }
+ else if (param.boxCoding == 1)
+ {
+ // Note that nmsBoxesOutput is always coded as BoxCorner, regardless of the input coding type.
+        EfficientNMSLandmark<T, BoxCenterSize<T>><<<gridSize, blockSize, 0, stream>>>(param, topNumData,
+            outputIndexData, outputClassData, sortedIndexData, sortedScoresData, topClassData, topAnchorsData,
+            (BoxCenterSize<T>*) boxesInput, (T*) landmarksInput, (BoxCenterSize<T>*) anchorsInput, numDetectionsOutput,
+            nmsScoresOutput, nmsClassesOutput, (BoxCorner<T>*) nmsBoxesOutput, (T*) nmsLandmarksOutput);
+ }
+
+ return cudaGetLastError();
+}
+
+__global__ void EfficientNMSLandmarkFilterSegments(EfficientNMSLandmarkParameters param,
+ const int* __restrict__ topNumData, int* __restrict__ topOffsetsStartData, int* __restrict__ topOffsetsEndData)
+{
+ int imageIdx = threadIdx.x;
+ if (imageIdx > param.batchSize)
+ {
+ return;
+ }
+ topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+ topOffsetsEndData[imageIdx] = imageIdx * param.numScoreElements + topNumData[imageIdx];
+}
+
+template <typename T>
+__global__ void EfficientNMSLandmarkFilter(EfficientNMSLandmarkParameters param, const T* __restrict__ scoresInput,
+ int* __restrict__ topNumData, int* __restrict__ topIndexData, int* __restrict__ topAnchorsData,
+ T* __restrict__ topScoresData, int* __restrict__ topClassData)
+{
+ int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+ int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+ // Boundary Conditions
+ if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ // Shape of scoresInput: [batchSize, numAnchors, numClasses]
+ int scoresInputIdx = imageIdx * param.numScoreElements + elementIdx;
+
+ // For each class, check its corresponding score if it crosses the threshold, and if so select this anchor,
+ // and keep track of the maximum score and the corresponding (argmax) class id
+ T score = scoresInput[scoresInputIdx];
+ if (gte_mp(score, (T) param.scoreThreshold))
+ {
+ // Unpack the class and anchor index from the element index
+ int classIdx = elementIdx % param.numClasses;
+ int anchorIdx = elementIdx / param.numClasses;
+
+ // If this is a background class, ignore it.
+ if (classIdx == param.backgroundClass)
+ {
+ return;
+ }
+
+ // Use an atomic to find an open slot where to write the selected anchor data.
+ if (topNumData[imageIdx] >= param.numScoreElements)
+ {
+ return;
+ }
+ int selectedIdx = atomicAdd((unsigned int*) &topNumData[imageIdx], 1);
+ if (selectedIdx >= param.numScoreElements)
+ {
+ topNumData[imageIdx] = param.numScoreElements;
+ return;
+ }
+
+ // Shape of topScoresData / topClassData: [batchSize, numScoreElements]
+ int topIdx = imageIdx * param.numScoreElements + selectedIdx;
+
+ if (param.scoreBits > 0)
+ {
+ score = add_mp(score, (T) 1);
+ if (gt_mp(score, (T) (2.f - 1.f / 1024.f)))
+ {
+ // Ensure the incremented score fits in the mantissa without changing the exponent
+ score = (2.f - 1.f / 1024.f);
+ }
+ }
+
+ topIndexData[topIdx] = selectedIdx;
+ topAnchorsData[topIdx] = anchorIdx;
+ topScoresData[topIdx] = score;
+ topClassData[topIdx] = classIdx;
+ }
+}
+
+template <typename T>
+__global__ void EfficientNMSLandmarkDenseIndex(EfficientNMSLandmarkParameters param, int* __restrict__ topNumData,
+ int* __restrict__ topIndexData, int* __restrict__ topAnchorsData, int* __restrict__ topOffsetsStartData,
+ int* __restrict__ topOffsetsEndData, T* __restrict__ topScoresData, int* __restrict__ topClassData)
+{
+ int elementIdx = blockDim.x * blockIdx.x + threadIdx.x;
+ int imageIdx = blockDim.y * blockIdx.y + threadIdx.y;
+
+ if (elementIdx >= param.numScoreElements || imageIdx >= param.batchSize)
+ {
+ return;
+ }
+
+ int dataIdx = imageIdx * param.numScoreElements + elementIdx;
+ int anchorIdx = elementIdx / param.numClasses;
+ int classIdx = elementIdx % param.numClasses;
+ if (param.scoreBits > 0)
+ {
+ T score = topScoresData[dataIdx];
+ if (lt_mp(score, (T) param.scoreThreshold))
+ {
+ score = (T) 1;
+ }
+ else if (classIdx == param.backgroundClass)
+ {
+ score = (T) 1;
+ }
+ else
+ {
+ score = add_mp(score, (T) 1);
+ if (gt_mp(score, (T) (2.f - 1.f / 1024.f)))
+ {
+ // Ensure the incremented score fits in the mantissa without changing the exponent
+ score = (2.f - 1.f / 1024.f);
+ }
+ }
+ topScoresData[dataIdx] = score;
+ }
+ else
+ {
+ T score = topScoresData[dataIdx];
+ if (lt_mp(score, (T) param.scoreThreshold))
+ {
+ topScoresData[dataIdx] = -(1 << 15);
+ }
+ else if (classIdx == param.backgroundClass)
+ {
+ topScoresData[dataIdx] = -(1 << 15);
+ }
+ }
+
+ topIndexData[dataIdx] = elementIdx;
+ topAnchorsData[dataIdx] = anchorIdx;
+ topClassData[dataIdx] = classIdx;
+
+ if (elementIdx == 0)
+ {
+ // Saturate counters
+ topNumData[imageIdx] = param.numScoreElements;
+ topOffsetsStartData[imageIdx] = imageIdx * param.numScoreElements;
+ topOffsetsEndData[imageIdx] = (imageIdx + 1) * param.numScoreElements;
+ }
+}
+
+template <typename T>
+cudaError_t EfficientNMSLandmarkFilterLauncher(EfficientNMSLandmarkParameters& param, const T* scoresInput,
+ int* topNumData, int* topIndexData, int* topAnchorsData, int* topOffsetsStartData, int* topOffsetsEndData,
+ T* topScoresData, int* topClassData, cudaStream_t stream)
+{
+ const unsigned int elementsPerBlock = 512;
+ const unsigned int imagesPerBlock = 1;
+ const unsigned int elementBlocks = (param.numScoreElements + elementsPerBlock - 1) / elementsPerBlock;
+ const unsigned int imageBlocks = (param.batchSize + imagesPerBlock - 1) / imagesPerBlock;
+ const dim3 blockSize = {elementsPerBlock, imagesPerBlock, 1};
+ const dim3 gridSize = {elementBlocks, imageBlocks, 1};
+
+ float kernelSelectThreshold = 0.007f;
+ if (param.scoreSigmoid)
+ {
+ // Inverse Sigmoid
+ if (param.scoreThreshold <= 0.f)
+ {
+ param.scoreThreshold = -(1 << 15);
+ }
+ else
+ {
+ param.scoreThreshold = logf(param.scoreThreshold / (1.f - param.scoreThreshold));
+ }
+ kernelSelectThreshold = logf(kernelSelectThreshold / (1.f - kernelSelectThreshold));
+ // Disable Score Bits Optimization
+ param.scoreBits = -1;
+ }
+
+ if (param.scoreThreshold < kernelSelectThreshold)
+ {
+ // A full copy of the buffer is necessary because sorting will scramble the input data otherwise.
+ PLUGIN_CHECK_CUDA(cudaMemcpyAsync(topScoresData, scoresInput, param.batchSize * param.numScoreElements * sizeof(T),
+ cudaMemcpyDeviceToDevice, stream));
+
+        EfficientNMSLandmarkDenseIndex<T><<<gridSize, blockSize, 0, stream>>>(param, topNumData, topIndexData,
+ topAnchorsData, topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData);
+ }
+ else
+ {
+        EfficientNMSLandmarkFilter<T><<<gridSize, blockSize, 0, stream>>>(
+ param, scoresInput, topNumData, topIndexData, topAnchorsData, topScoresData, topClassData);
+
+ EfficientNMSLandmarkFilterSegments<<<1, param.batchSize, 0, stream>>>(
+ param, topNumData, topOffsetsStartData, topOffsetsEndData);
+ }
+
+ return cudaGetLastError();
+}
+
+template <typename T>
+size_t EfficientNMSLandmarkSortWorkspaceSize(int batchSize, int numScoreElements)
+{
+ size_t sortedWorkspaceSize = 0;
+    cub::DoubleBuffer<T> keysDB(nullptr, nullptr);
+    cub::DoubleBuffer<int> valuesDB(nullptr, nullptr);
+ cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, sortedWorkspaceSize, keysDB, valuesDB, numScoreElements,
+ batchSize, (const int*) nullptr, (const int*) nullptr);
+ return sortedWorkspaceSize;
+}
+
+size_t EfficientNMSLandmarkWorkspaceSize(int batchSize, int numScoreElements, int numClasses, DataType datatype)
+{
+ size_t total = 0;
+ const size_t align = 256;
+ // Counters
+ // 3 for Filtering
+ // 1 for Output Indexing
+ // C for Max per Class Limiting
+ size_t size = (3 + 1 + numClasses) * batchSize * sizeof(int);
+ total += size + (size % align ? align - (size % align) : 0);
+ // Int Buffers
+ for (int i = 0; i < 4; i++)
+ {
+ size = batchSize * numScoreElements * sizeof(int);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ // Float Buffers
+ for (int i = 0; i < 2; i++)
+ {
+ size = batchSize * numScoreElements * dataTypeSize(datatype);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ // Sort Workspace
+ if (datatype == DataType::kHALF)
+ {
+ size = EfficientNMSLandmarkSortWorkspaceSize<__half>(batchSize, numScoreElements);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+ else if (datatype == DataType::kFLOAT)
+ {
+        size = EfficientNMSLandmarkSortWorkspaceSize<float>(batchSize, numScoreElements);
+ total += size + (size % align ? align - (size % align) : 0);
+ }
+
+ return total;
+}
+
+template <typename T>
+T* EfficientNMSLandmarkWorkspace(void* workspace, size_t& offset, size_t elements)
+{
+ T* buffer = (T*) ((size_t) workspace + offset);
+ size_t align = 256;
+ size_t size = elements * sizeof(T);
+ size_t sizeAligned = size + (size % align ? align - (size % align) : 0);
+ offset += sizeAligned;
+ return buffer;
+}
+
+template <typename T>
+pluginStatus_t EfficientNMSLandmarkDispatch(EfficientNMSLandmarkParameters param, const void* boxesInput,
+ const void* scoresInput, const void* landmarksInput, const void* anchorsInput, void* numDetectionsOutput,
+ void* nmsBoxesOutput, void* nmsScoresOutput, void* nmsClassesOutput, void* nmsLandmarksOutput, void* workspace,
+ cudaStream_t stream)
+{
+ // Clear Outputs (not all elements will get overwritten by the kernels, so safer to clear everything out)
+ CSC(cudaMemsetAsync(numDetectionsOutput, 0x00, param.batchSize * sizeof(int), stream));
+ CSC(cudaMemsetAsync(nmsScoresOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(T), stream));
+ CSC(cudaMemsetAsync(nmsBoxesOutput, 0x00, param.batchSize * param.numOutputBoxes * 4 * sizeof(T), stream));
+ CSC(cudaMemsetAsync(nmsClassesOutput, 0x00, param.batchSize * param.numOutputBoxes * sizeof(int), stream));
+ CSC(cudaMemsetAsync(nmsLandmarksOutput, 0x00, param.batchSize * param.numOutputBoxes * 10 * sizeof(T), stream));
+
+ // Empty Inputs
+ if (param.numScoreElements < 1)
+ {
+ return STATUS_SUCCESS;
+ }
+
+ // Counters Workspace
+ size_t workspaceOffset = 0;
+ int countersTotalSize = (3 + 1 + param.numClasses) * param.batchSize;
+    int* topNumData = EfficientNMSLandmarkWorkspace<int>(workspace, workspaceOffset, countersTotalSize);
+ int* topOffsetsStartData = topNumData + param.batchSize;
+ int* topOffsetsEndData = topNumData + 2 * param.batchSize;
+ int* outputIndexData = topNumData + 3 * param.batchSize;
+ int* outputClassData = topNumData + 4 * param.batchSize;
+ CSC(cudaMemsetAsync(topNumData, 0x00, countersTotalSize * sizeof(int), stream));
+ cudaError_t status = cudaGetLastError();
+ CSC(status, STATUS_FAILURE);
+
+ // Other Buffers Workspace
+    int* topIndexData
+        = EfficientNMSLandmarkWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    int* topClassData
+        = EfficientNMSLandmarkWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    int* topAnchorsData
+        = EfficientNMSLandmarkWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    int* sortedIndexData
+        = EfficientNMSLandmarkWorkspace<int>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    T* topScoresData
+        = EfficientNMSLandmarkWorkspace<T>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    T* sortedScoresData
+        = EfficientNMSLandmarkWorkspace<T>(workspace, workspaceOffset, param.batchSize * param.numScoreElements);
+    size_t sortedWorkspaceSize = EfficientNMSLandmarkSortWorkspaceSize<T>(param.batchSize, param.numScoreElements);
+    char* sortedWorkspaceData = EfficientNMSLandmarkWorkspace<char>(workspace, workspaceOffset, sortedWorkspaceSize);
+    cub::DoubleBuffer<T> scoresDB(topScoresData, sortedScoresData);
+    cub::DoubleBuffer<int> indexDB(topIndexData, sortedIndexData);
+
+ // Device Specific Properties
+ int device;
+ CSC(cudaGetDevice(&device));
+ struct cudaDeviceProp properties;
+ CSC(cudaGetDeviceProperties(&properties, device));
+ if (properties.regsPerBlock >= 65536)
+ {
+ // Most Devices
+ param.numSelectedBoxes = 5000;
+ }
+ else
+ {
+ // Jetson TX1/TX2
+ param.numSelectedBoxes = 2000;
+ }
+
+ // Kernels
+ status = EfficientNMSLandmarkFilterLauncher(param, (T*) scoresInput, topNumData, topIndexData, topAnchorsData,
+ topOffsetsStartData, topOffsetsEndData, topScoresData, topClassData, stream);
+ CSC(status, STATUS_FAILURE);
+
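+    // Per-image (segmented) descending sort of the filtered scores. With the fp16 score-bits
+    // optimization, only bits [10 - scoreBits, 10) of the shifted scores are radix-sorted,
+    // reducing the number of sort passes.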
+ status = cub::DeviceSegmentedRadixSort::SortPairsDescending(sortedWorkspaceData, sortedWorkspaceSize, scoresDB,
+ indexDB, param.batchSize * param.numScoreElements, param.batchSize, topOffsetsStartData, topOffsetsEndData,
+ param.scoreBits > 0 ? (10 - param.scoreBits) : 0, param.scoreBits > 0 ? 10 : sizeof(T) * 8, stream, false);
+ CSC(status, STATUS_FAILURE);
+
+ status = EfficientNMSLandmarkLauncher(param, topNumData, outputIndexData, outputClassData, indexDB.Current(),
+ scoresDB.Current(), topClassData, topAnchorsData, boxesInput, landmarksInput, anchorsInput,
+ (int*) numDetectionsOutput, (T*) nmsScoresOutput, (int*) nmsClassesOutput, nmsBoxesOutput, nmsLandmarksOutput,
+ stream);
+ CSC(status, STATUS_FAILURE);
+
+ return STATUS_SUCCESS;
+}
+
+pluginStatus_t EfficientNMSLandmarkInference(EfficientNMSLandmarkParameters param, const void* boxesInput,
+ const void* scoresInput, const void* landmarksInput, const void* anchorsInput, void* numDetectionsOutput,
+ void* nmsBoxesOutput, void* nmsScoresOutput, void* nmsClassesOutput, void* nmsLandmarksOutput, void* workspace,
+ cudaStream_t stream)
+{
+ if (param.datatype == DataType::kFLOAT)
+ {
+ param.scoreBits = -1;
+        return EfficientNMSLandmarkDispatch<float>(param, boxesInput, scoresInput, landmarksInput, anchorsInput,
+ numDetectionsOutput, nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsLandmarksOutput, workspace,
+ stream);
+ }
+ else if (param.datatype == DataType::kHALF)
+ {
+ if (param.scoreBits <= 0 || param.scoreBits > 10)
+ {
+ param.scoreBits = -1;
+ }
+ return EfficientNMSLandmarkDispatch<__half>(param, boxesInput, scoresInput, landmarksInput, anchorsInput,
+ numDetectionsOutput, nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsLandmarksOutput, workspace,
+ stream);
+ }
+ else
+ {
+ return STATUS_NOT_SUPPORTED;
+ }
+}
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cuh b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cuh
new file mode 100644
index 00000000..ab8f8986
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.cuh
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_LANDMARK_INFERENCE_CUH
+#define TRT_EFFICIENT_NMS_LANDMARK_INFERENCE_CUH
+
+#include <cuda_fp16.h>
+
+// FP32 Intrinsics
+
+float __device__ __inline__ exp_mp(const float a)
+{
+ return __expf(a);
+}
+float __device__ __inline__ sigmoid_mp(const float a)
+{
+ return __frcp_rn(__fadd_rn(1.f, __expf(-a)));
+}
+float __device__ __inline__ add_mp(const float a, const float b)
+{
+ return __fadd_rn(a, b);
+}
+float __device__ __inline__ sub_mp(const float a, const float b)
+{
+ return __fsub_rn(a, b);
+}
+float __device__ __inline__ mul_mp(const float a, const float b)
+{
+ return __fmul_rn(a, b);
+}
+bool __device__ __inline__ gt_mp(const float a, const float b)
+{
+ return a > b;
+}
+bool __device__ __inline__ lt_mp(const float a, const float b)
+{
+ return a < b;
+}
+bool __device__ __inline__ lte_mp(const float a, const float b)
+{
+ return a <= b;
+}
+bool __device__ __inline__ gte_mp(const float a, const float b)
+{
+ return a >= b;
+}
+
+#if __CUDA_ARCH__ >= 530
+
+// FP16 Intrinsics
+
+__half __device__ __inline__ exp_mp(const __half a)
+{
+ return hexp(a);
+}
+__half __device__ __inline__ sigmoid_mp(const __half a)
+{
+ return hrcp(__hadd((__half) 1, hexp(__hneg(a))));
+}
+__half __device__ __inline__ add_mp(const __half a, const __half b)
+{
+ return __hadd(a, b);
+}
+__half __device__ __inline__ sub_mp(const __half a, const __half b)
+{
+ return __hsub(a, b);
+}
+__half __device__ __inline__ mul_mp(const __half a, const __half b)
+{
+ return __hmul(a, b);
+}
+bool __device__ __inline__ gt_mp(const __half a, const __half b)
+{
+ return __hgt(a, b);
+}
+bool __device__ __inline__ lt_mp(const __half a, const __half b)
+{
+ return __hlt(a, b);
+}
+bool __device__ __inline__ lte_mp(const __half a, const __half b)
+{
+ return __hle(a, b);
+}
+bool __device__ __inline__ gte_mp(const __half a, const __half b)
+{
+ return __hge(a, b);
+}
+
+#else
+
+// FP16 Fallbacks on older architectures that lack support
+
+__half __device__ __inline__ exp_mp(const __half a)
+{
+ return __float2half(exp_mp(__half2float(a)));
+}
+__half __device__ __inline__ sigmoid_mp(const __half a)
+{
+ return __float2half(sigmoid_mp(__half2float(a)));
+}
+__half __device__ __inline__ add_mp(const __half a, const __half b)
+{
+ return __float2half(add_mp(__half2float(a), __half2float(b)));
+}
+__half __device__ __inline__ sub_mp(const __half a, const __half b)
+{
+ return __float2half(sub_mp(__half2float(a), __half2float(b)));
+}
+__half __device__ __inline__ mul_mp(const __half a, const __half b)
+{
+ return __float2half(mul_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ gt_mp(const __half a, const __half b)
+{
+ return __float2half(gt_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ lt_mp(const __half a, const __half b)
+{
+ return __float2half(lt_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ lte_mp(const __half a, const __half b)
+{
+ return __float2half(lte_mp(__half2float(a), __half2float(b)));
+}
+bool __device__ __inline__ gte_mp(const __half a, const __half b)
+{
+ return __float2half(gte_mp(__half2float(a), __half2float(b)));
+}
+
+#endif
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCorner;
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCenterSize;
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCorner
+{
+ // For NMS/IOU purposes, YXYX coding is identical to XYXY
+ T y1, x1, y2, x2;
+
+ __device__ void reorder()
+ {
+ if (gt_mp(y1, y2))
+ {
+ // Swap values, so y1 < y2
+ y1 = sub_mp(y1, y2);
+ y2 = add_mp(y1, y2);
+ y1 = sub_mp(y2, y1);
+ }
+ if (gt_mp(x1, x2))
+ {
+ // Swap values, so x1 < x2
+ x1 = sub_mp(x1, x2);
+ x2 = add_mp(x1, x2);
+ x1 = sub_mp(x2, x1);
+ }
+ }
+
+    __device__ BoxCorner<T> clip(T low, T high) const
+ {
+ return {lt_mp(y1, low) ? low : (gt_mp(y1, high) ? high : y1),
+ lt_mp(x1, low) ? low : (gt_mp(x1, high) ? high : x1), lt_mp(y2, low) ? low : (gt_mp(y2, high) ? high : y2),
+ lt_mp(x2, low) ? low : (gt_mp(x2, high) ? high : x2)};
+ }
+
+    __device__ BoxCorner<T> decode(BoxCorner<T> anchor) const
+ {
+ return {add_mp(y1, anchor.y1), add_mp(x1, anchor.x1), add_mp(y2, anchor.y2), add_mp(x2, anchor.x2)};
+ }
+
+ __device__ float area() const
+ {
+ T w = sub_mp(x2, x1);
+ T h = sub_mp(y2, y1);
+ if (lte_mp(h, (T) 0))
+ {
+ return 0;
+ }
+ if (lte_mp(w, (T) 0))
+ {
+ return 0;
+ }
+ return (float) h * (float) w;
+ }
+
+    __device__ operator BoxCenterSize<T>() const
+    {
+        T w = sub_mp(x2, x1);
+        T h = sub_mp(y2, y1);
+        return BoxCenterSize<T>{add_mp(y1, mul_mp((T) 0.5, h)), add_mp(x1, mul_mp((T) 0.5, w)), h, w};
+ }
+
+    __device__ static BoxCorner<T> intersect(BoxCorner<T> a, BoxCorner<T> b)
+ {
+ return {gt_mp(a.y1, b.y1) ? a.y1 : b.y1, gt_mp(a.x1, b.x1) ? a.x1 : b.x1, lt_mp(a.y2, b.y2) ? a.y2 : b.y2,
+ lt_mp(a.x2, b.x2) ? a.x2 : b.x2};
+ }
+};
+
+template <typename T>
+struct __align__(4 * sizeof(T)) BoxCenterSize
+{
+ // For NMS/IOU purposes, YXHW coding is identical to XYWH
+ T y, x, h, w;
+
+ __device__ void reorder() {}
+
+    __device__ BoxCenterSize<T> clip(T low, T high) const
+    {
+        return BoxCenterSize<T>(BoxCorner<T>(*this).clip(low, high));
+ }
+
+    __device__ BoxCenterSize<T> decode(BoxCenterSize<T> anchor) const
+ {
+ return {add_mp(mul_mp(y, anchor.h), anchor.y), add_mp(mul_mp(x, anchor.w), anchor.x),
+ mul_mp(anchor.h, exp_mp(h)), mul_mp(anchor.w, exp_mp(w))};
+ }
+
+ __device__ float area() const
+ {
+ if (h <= (T) 0)
+ {
+ return 0;
+ }
+ if (w <= (T) 0)
+ {
+ return 0;
+ }
+ return (float) h * (float) w;
+ }
+
+    __device__ operator BoxCorner<T>() const
+    {
+        T h2 = mul_mp(h, (T) 0.5);
+        T w2 = mul_mp(w, (T) 0.5);
+        return BoxCorner<T>{sub_mp(y, h2), sub_mp(x, w2), add_mp(y, h2), add_mp(x, w2)};
+    }
+    __device__ static BoxCenterSize<T> intersect(BoxCenterSize<T> a, BoxCenterSize<T> b)
+    {
+        return BoxCenterSize<T>(BoxCorner<T>::intersect(BoxCorner<T>(a), BoxCorner<T>(b)));
+ }
+};
+
+template <typename T>
+struct Landmark
+{
+ T lmk[10];
+};
+
+#endif
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.h b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.h
new file mode 100644
index 00000000..7b66cd35
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkInference.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_LANDMARK_INFERENCE_H
+#define TRT_EFFICIENT_NMS_LANDMARK_INFERENCE_H
+
+#include "common/plugin.h"
+
+#include "efficientNMSLandmarkParameters.h"
+
+size_t EfficientNMSLandmarkWorkspaceSize(int batchSize, int numScoreElements, int numClasses, nvinfer1::DataType datatype);
+
+pluginStatus_t EfficientNMSLandmarkInference(EfficientNMSLandmarkParameters param, const void* boxesInput,
+ const void* scoresInput, const void* landmarksInput, const void* anchorsInput, void* numDetectionsOutput,
+ void* nmsBoxesOutput, void* nmsScoresOutput, void* nmsClassesOutput, void* nmsLandmarksOutput, void* workspace,
+ cudaStream_t stream);
+
+#endif
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkParameters.h b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkParameters.h
new file mode 100644
index 00000000..efa1d18f
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkParameters.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TRT_EFFICIENT_NMS_LANDMARK_PARAMETERS_H
+#define TRT_EFFICIENT_NMS_LANDMARK_PARAMETERS_H
+
+#include "common/plugin.h"
+
+using namespace nvinfer1::plugin;
+namespace nvinfer1
+{
+namespace plugin
+{
+
+struct EfficientNMSLandmarkParameters
+{
+ // Related to NMS Options
+ float iouThreshold = 0.5f;
+ float scoreThreshold = 0.5f;
+ int numOutputBoxes = 100;
+ int numOutputBoxesPerClass = -1;
+ bool padOutputBoxesPerClass = false;
+ int backgroundClass = -1;
+ bool scoreSigmoid = false;
+ bool clipBoxes = false;
+ int boxCoding = 0;
+
+ // Related to NMS Internals
+ int numSelectedBoxes = 4096;
+ int scoreBits = -1;
+
+ // Related to Tensor Configuration
+ // (These are set by the various plugin configuration methods, no need to define them during plugin creation.)
+ int batchSize = -1;
+ int numClasses = 1;
+ int numBoxElements = -1;
+ int numScoreElements = -1;
+ int numAnchors = -1;
+ bool shareLocation = true;
+ bool shareAnchors = true;
+ bool boxDecoder = false;
+ nvinfer1::DataType datatype = nvinfer1::DataType::kFLOAT;
+};
+
+} // namespace plugin
+} // namespace nvinfer1
+
+#endif
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.cpp b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.cpp
new file mode 100644
index 00000000..bc784249
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.cpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "efficientNMSLandmarkPlugin.h"
+#include "efficientNMSLandmarkInference.h"
+
+using namespace nvinfer1;
+using nvinfer1::plugin::EfficientNMSLandmarkPlugin;
+using nvinfer1::plugin::EfficientNMSLandmarkParameters;
+using nvinfer1::plugin::EfficientNMSLandmarkPluginCreator;
+
+namespace
+{
+const char* EFFICIENT_NMS_LANDMARK_PLUGIN_VERSION{"1"};
+const char* EFFICIENT_NMS_LANDMARK_PLUGIN_NAME{"EfficientNMSLandmark_TRT"};
+} // namespace
+
+EfficientNMSLandmarkPlugin::EfficientNMSLandmarkPlugin(EfficientNMSLandmarkParameters param)
+ : mParam(param)
+{
+}
+
+EfficientNMSLandmarkPlugin::EfficientNMSLandmarkPlugin(const void* data, size_t length)
+{
+    const char *d = reinterpret_cast<const char*>(data), *a = d;
+    mParam = read<EfficientNMSLandmarkParameters>(d);
+ PLUGIN_VALIDATE(d == a + length);
+}
+
+const char* EfficientNMSLandmarkPlugin::getPluginType() const noexcept
+{
+ return EFFICIENT_NMS_LANDMARK_PLUGIN_NAME;
+}
+
+const char* EfficientNMSLandmarkPlugin::getPluginVersion() const noexcept
+{
+ return EFFICIENT_NMS_LANDMARK_PLUGIN_VERSION;
+}
+
+int EfficientNMSLandmarkPlugin::getNbOutputs() const noexcept
+{
+ // Standard Plugin Implementation
+ return 5;
+}
+
+int EfficientNMSLandmarkPlugin::initialize() noexcept
+{
+ return STATUS_SUCCESS;
+}
+
+void EfficientNMSLandmarkPlugin::terminate() noexcept {}
+
+size_t EfficientNMSLandmarkPlugin::getSerializationSize() const noexcept
+{
+ return sizeof(EfficientNMSLandmarkParameters);
+}
+
+void EfficientNMSLandmarkPlugin::serialize(void* buffer) const noexcept
+{
+    char *d = reinterpret_cast<char*>(buffer), *a = d;
+ write(d, mParam);
+ PLUGIN_VALIDATE(d == a + getSerializationSize());
+}
+
+void EfficientNMSLandmarkPlugin::destroy() noexcept
+{
+ delete this;
+}
+
+void EfficientNMSLandmarkPlugin::setPluginNamespace(const char* pluginNamespace) noexcept
+{
+ try
+ {
+ mNamespace = pluginNamespace;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+}
+
+const char* EfficientNMSLandmarkPlugin::getPluginNamespace() const noexcept
+{
+ return mNamespace.c_str();
+}
+
+nvinfer1::DataType EfficientNMSLandmarkPlugin::getOutputDataType(
+ int index, const nvinfer1::DataType* inputTypes, int nbInputs) const noexcept
+{
+ // On standard NMS, num_detections and detection_classes use integer outputs
+ if (index == 0 || index == 3)
+ {
+ return nvinfer1::DataType::kINT32;
+ }
+ // All others should use the same datatype as the input
+ return inputTypes[0];
+}
+
+IPluginV2DynamicExt* EfficientNMSLandmarkPlugin::clone() const noexcept
+{
+ try
+ {
+ auto* plugin = new EfficientNMSLandmarkPlugin(mParam);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
+
+DimsExprs EfficientNMSLandmarkPlugin::getOutputDimensions(
+ int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) noexcept
+{
+ try
+ {
+ DimsExprs out_dim;
+
+ // When pad per class is set, the output size may need to be reduced:
+ // i.e.: outputBoxes = min(outputBoxes, outputBoxesPerClass * numClasses)
+ // As the number of classes may not be static, numOutputBoxes must be a dynamic
+ // expression. The corresponding parameter can not be set at this time, so the
+ // value will be calculated again in configurePlugin() and the param overwritten.
+ const IDimensionExpr* numOutputBoxes = exprBuilder.constant(mParam.numOutputBoxes);
+ if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0)
+ {
+ const IDimensionExpr* numOutputBoxesPerClass = exprBuilder.constant(mParam.numOutputBoxesPerClass);
+ const IDimensionExpr* numClasses = inputs[1].d[2];
+ numOutputBoxes = exprBuilder.operation(DimensionOperation::kMIN, *numOutputBoxes,
+ *exprBuilder.operation(DimensionOperation::kPROD, *numOutputBoxesPerClass, *numClasses));
+ }
+
+ // Standard NMS
+ PLUGIN_VALIDATE(outputIndex >= 0 && outputIndex <= 4);
+
+ // num_detections
+ if (outputIndex == 0)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = exprBuilder.constant(1);
+ }
+ // detection_boxes
+ else if (outputIndex == 1)
+ {
+ out_dim.nbDims = 3;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ out_dim.d[2] = exprBuilder.constant(4);
+ }
+ // detection_scores
+ else if (outputIndex == 2)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ }
+ // detection_classes
+ else if (outputIndex == 3)
+ {
+ out_dim.nbDims = 2;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ }
+ // detection_lmks
+ else if (outputIndex == 4)
+ {
+ out_dim.nbDims = 3;
+ out_dim.d[0] = inputs[0].d[0];
+ out_dim.d[1] = numOutputBoxes;
+ out_dim.d[2] = exprBuilder.constant(10);
+ }
+
+ return out_dim;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return DimsExprs{};
+}
+
+bool EfficientNMSLandmarkPlugin::supportsFormatCombination(
+ int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept
+{
+ if (inOut[pos].format != PluginFormat::kLINEAR)
+ {
+ return false;
+ }
+
+ PLUGIN_VALIDATE(nbInputs == 3 || nbInputs == 4);
+ PLUGIN_VALIDATE(nbOutputs == 5);
+ if (nbInputs == 3)
+ {
+ PLUGIN_VALIDATE(0 <= pos && pos <= 7);
+ }
+ if (nbInputs == 4)
+ {
+ PLUGIN_VALIDATE(0 <= pos && pos <= 8);
+ }
+
+ // num_detections and detection_classes output: int
+ const int posOut = pos - nbInputs;
+ if (posOut == 0 || posOut == 3)
+ {
+ return inOut[pos].type == DataType::kINT32 && inOut[pos].format == PluginFormat::kLINEAR;
+ }
+
+ // all other inputs/outputs: fp32 or fp16
+ return (inOut[pos].type == DataType::kHALF || inOut[pos].type == DataType::kFLOAT)
+ && (inOut[0].type == inOut[pos].type);
+}
+
+void EfficientNMSLandmarkPlugin::configurePlugin(
+ const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out, int nbOutputs) noexcept
+{
+ try
+ {
+        // Accepts three or four inputs
+        // If three inputs: [0] boxes, [1] scores, [2] landmarks
+        // If four inputs:  [0] boxes, [1] scores, [2] landmarks, [3] anchors
+ PLUGIN_VALIDATE(nbInputs == 3 || nbInputs == 4);
+ PLUGIN_VALIDATE(nbOutputs == 5);
+ mParam.datatype = in[0].desc.type;
+
+ // Shape of scores input should be
+ // [batch_size, num_boxes, num_classes] or [batch_size, num_boxes, num_classes, 1]
+ PLUGIN_VALIDATE(in[1].desc.dims.nbDims == 3 || (in[1].desc.dims.nbDims == 4 && in[1].desc.dims.d[3] == 1));
+ mParam.numScoreElements = in[1].desc.dims.d[1] * in[1].desc.dims.d[2];
+ mParam.numClasses = in[1].desc.dims.d[2];
+
+        // When pad per class is set, the total output boxes size may need to be reduced.
+ // This operation is also done in getOutputDimension(), but for dynamic shapes, the
+ // numOutputBoxes param can't be set until the number of classes is fully known here.
+ if (mParam.padOutputBoxesPerClass && mParam.numOutputBoxesPerClass > 0)
+ {
+ if (mParam.numOutputBoxesPerClass * mParam.numClasses < mParam.numOutputBoxes)
+ {
+ mParam.numOutputBoxes = mParam.numOutputBoxesPerClass * mParam.numClasses;
+ }
+ }
+
+ // Shape of boxes input should be
+ // [batch_size, num_boxes, 4] or [batch_size, num_boxes, 1, 4] or [batch_size, num_boxes, num_classes, 4]
+ PLUGIN_VALIDATE(in[0].desc.dims.nbDims == 3 || in[0].desc.dims.nbDims == 4);
+ if (in[0].desc.dims.nbDims == 3)
+ {
+ PLUGIN_VALIDATE(in[0].desc.dims.d[2] == 4);
+ mParam.shareLocation = true;
+ mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2];
+ }
+ else
+ {
+ mParam.shareLocation = (in[0].desc.dims.d[2] == 1);
+ PLUGIN_VALIDATE(in[0].desc.dims.d[2] == mParam.numClasses || mParam.shareLocation);
+ PLUGIN_VALIDATE(in[0].desc.dims.d[3] == 4);
+ mParam.numBoxElements = in[0].desc.dims.d[1] * in[0].desc.dims.d[2] * in[0].desc.dims.d[3];
+ }
+ mParam.numAnchors = in[0].desc.dims.d[1];
+
+ // Shape of landmarks input should be
+ // [batch_size, num_boxes, 10] or [batch_size, num_boxes, 1, 10]
+ PLUGIN_VALIDATE(in[2].desc.dims.nbDims == 3 || in[2].desc.dims.nbDims == 4);
+ if (in[2].desc.dims.nbDims == 3)
+ {
+ PLUGIN_VALIDATE(in[2].desc.dims.d[2] == 10);
+ }
+ else
+ {
+ PLUGIN_VALIDATE(in[2].desc.dims.d[2] == 1);
+ PLUGIN_VALIDATE(in[2].desc.dims.d[3] == 10);
+ }
+
+ if (nbInputs == 3)
+ {
+            // Only three inputs are given (no anchors), disable the fused box decoder
+ mParam.boxDecoder = false;
+ }
+ if (nbInputs == 4)
+ {
+            // All four inputs are given, enable the fused box decoder
+ // Shape of anchors input should be
+ // Constant shape: [1, numAnchors, 4] or [batch_size, numAnchors, 4]
+ PLUGIN_VALIDATE(in[3].desc.dims.nbDims == 3);
+ mParam.boxDecoder = true;
+ mParam.shareAnchors = (in[3].desc.dims.d[0] == 1);
+ }
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+}
+
+size_t EfficientNMSLandmarkPlugin::getWorkspaceSize(
+ const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs, int nbOutputs) const noexcept
+{
+ int batchSize = inputs[1].dims.d[0];
+ int numScoreElements = inputs[1].dims.d[1] * inputs[1].dims.d[2];
+ int numClasses = inputs[1].dims.d[2];
+ return EfficientNMSLandmarkWorkspaceSize(batchSize, numScoreElements, numClasses, mParam.datatype);
+}
+
+int EfficientNMSLandmarkPlugin::enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc,
+ const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept
+{
+ try
+ {
+ mParam.batchSize = inputDesc[0].dims.d[0];
+
+ // Standard NMS Operation
+ const void* const boxesInput = inputs[0];
+ const void* const scoresInput = inputs[1];
+ const void* const landmarksInput = inputs[2];
+ const void* const anchorsInput = mParam.boxDecoder ? inputs[3] : nullptr;
+
+ void* numDetectionsOutput = outputs[0];
+ void* nmsBoxesOutput = outputs[1];
+ void* nmsScoresOutput = outputs[2];
+ void* nmsClassesOutput = outputs[3];
+ void* nmsLandmarksOutput = outputs[4];
+
+ return EfficientNMSLandmarkInference(mParam, boxesInput, scoresInput, landmarksInput, anchorsInput,
+ numDetectionsOutput, nmsBoxesOutput, nmsScoresOutput, nmsClassesOutput, nmsLandmarksOutput, workspace,
+ stream);
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return -1;
+}
+
+// Standard NMS Plugin Operation
+
+EfficientNMSLandmarkPluginCreator::EfficientNMSLandmarkPluginCreator()
+ : mParam{}
+{
+ mPluginAttributes.clear();
+ mPluginAttributes.emplace_back(PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+ mPluginAttributes.emplace_back(PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1));
+ mPluginAttributes.emplace_back(PluginField("max_output_boxes", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("background_class", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("score_activation", nullptr, PluginFieldType::kINT32, 1));
+ mPluginAttributes.emplace_back(PluginField("box_coding", nullptr, PluginFieldType::kINT32, 1));
+ mFC.nbFields = mPluginAttributes.size();
+ mFC.fields = mPluginAttributes.data();
+}
+
+const char* EfficientNMSLandmarkPluginCreator::getPluginName() const noexcept
+{
+ return EFFICIENT_NMS_LANDMARK_PLUGIN_NAME;
+}
+
+const char* EfficientNMSLandmarkPluginCreator::getPluginVersion() const noexcept
+{
+ return EFFICIENT_NMS_LANDMARK_PLUGIN_VERSION;
+}
+
+const PluginFieldCollection* EfficientNMSLandmarkPluginCreator::getFieldNames() noexcept
+{
+ return &mFC;
+}
+
+IPluginV2DynamicExt* EfficientNMSLandmarkPluginCreator::createPlugin(
+ const char* name, const PluginFieldCollection* fc) noexcept
+{
+ try
+ {
+ const PluginField* fields = fc->fields;
+ for (int i = 0; i < fc->nbFields; ++i)
+ {
+ const char* attrName = fields[i].name;
+ if (!strcmp(attrName, "score_threshold"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32);
+ mParam.scoreThreshold = *(static_cast<float const*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "iou_threshold"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kFLOAT32);
+ mParam.iouThreshold = *(static_cast<float const*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "max_output_boxes"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32);
+ mParam.numOutputBoxes = *(static_cast<int32_t const*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "background_class"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32);
+ mParam.backgroundClass = *(static_cast<int32_t const*>(fields[i].data));
+ }
+ if (!strcmp(attrName, "score_activation"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32);
+ auto const scoreSigmoid = *(static_cast<int32_t const*>(fields[i].data));
+ PLUGIN_VALIDATE(scoreSigmoid == 0 || scoreSigmoid == 1);
+ mParam.scoreSigmoid = static_cast<bool>(scoreSigmoid);
+ }
+ if (!strcmp(attrName, "box_coding"))
+ {
+ PLUGIN_VALIDATE(fields[i].type == PluginFieldType::kINT32);
+ mParam.boxCoding = *(static_cast<int32_t const*>(fields[i].data));
+ }
+ }
+
+ auto* plugin = new EfficientNMSLandmarkPlugin(mParam);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
+
+IPluginV2DynamicExt* EfficientNMSLandmarkPluginCreator::deserializePlugin(
+ const char* name, const void* serialData, size_t serialLength) noexcept
+{
+ try
+ {
+ // This object will be deleted when the network is destroyed, which will
+ // call EfficientNMSLandmarkPlugin::destroy()
+ auto* plugin = new EfficientNMSLandmarkPlugin(serialData, serialLength);
+ plugin->setPluginNamespace(mNamespace.c_str());
+ return plugin;
+ }
+ catch (const std::exception& e)
+ {
+ caughtError(e);
+ }
+ return nullptr;
+}
diff --git a/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.h b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.h
new file mode 100644
index 00000000..fd069f6c
--- /dev/null
+++ b/plugin/efficientNMSLandmarkPlugin/efficientNMSLandmarkPlugin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef TRT_EFFICIENT_NMS_LANDMARK_PLUGIN_H
+#define TRT_EFFICIENT_NMS_LANDMARK_PLUGIN_H
+
+#include <string>
+#include <vector>
+
+#include "common/plugin.h"
+#include "efficientNMSLandmarkParameters.h"
+
+using namespace nvinfer1::plugin;
+namespace nvinfer1
+{
+namespace plugin
+{
+
+class EfficientNMSLandmarkPlugin : public IPluginV2DynamicExt
+{
+public:
+ explicit EfficientNMSLandmarkPlugin(EfficientNMSLandmarkParameters param);
+ EfficientNMSLandmarkPlugin(const void* data, size_t length);
+ ~EfficientNMSLandmarkPlugin() override = default;
+
+ // IPluginV2 methods
+ const char* getPluginType() const noexcept override;
+ const char* getPluginVersion() const noexcept override;
+ int getNbOutputs() const noexcept override;
+ int initialize() noexcept override;
+ void terminate() noexcept override;
+ size_t getSerializationSize() const noexcept override;
+ void serialize(void* buffer) const noexcept override;
+ void destroy() noexcept override;
+ void setPluginNamespace(const char* libNamespace) noexcept override;
+ const char* getPluginNamespace() const noexcept override;
+
+ // IPluginV2Ext methods
+ nvinfer1::DataType getOutputDataType(
+ int index, const nvinfer1::DataType* inputType, int nbInputs) const noexcept override;
+
+ // IPluginV2DynamicExt methods
+ IPluginV2DynamicExt* clone() const noexcept override;
+ DimsExprs getOutputDimensions(
+ int outputIndex, const DimsExprs* inputs, int nbInputs, IExprBuilder& exprBuilder) noexcept override;
+ bool supportsFormatCombination(
+ int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) noexcept override;
+ void configurePlugin(const DynamicPluginTensorDesc* in, int nbInputs, const DynamicPluginTensorDesc* out,
+ int nbOutputs) noexcept override;
+ size_t getWorkspaceSize(const PluginTensorDesc* inputs, int nbInputs, const PluginTensorDesc* outputs,
+ int nbOutputs) const noexcept override;
+ int enqueue(const PluginTensorDesc* inputDesc, const PluginTensorDesc* outputDesc, const void* const* inputs,
+ void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
+
+protected:
+ EfficientNMSLandmarkParameters mParam{};
+ std::string mNamespace;
+};
+
+// Standard NMS Plugin Operation
+class EfficientNMSLandmarkPluginCreator : public nvinfer1::pluginInternal::BaseCreator
+{
+public:
+ EfficientNMSLandmarkPluginCreator();
+ ~EfficientNMSLandmarkPluginCreator() override = default;
+
+ const char* getPluginName() const noexcept override;
+ const char* getPluginVersion() const noexcept override;
+ const PluginFieldCollection* getFieldNames() noexcept override;
+
+ IPluginV2DynamicExt* createPlugin(const char* name, const PluginFieldCollection* fc) noexcept override;
+ IPluginV2DynamicExt* deserializePlugin(
+ const char* name, const void* serialData, size_t serialLength) noexcept override;
+
+protected:
+ PluginFieldCollection mFC;
+ EfficientNMSLandmarkParameters mParam;
+ std::vector<PluginField> mPluginAttributes;
+ std::string mPluginName;
+};
+
+} // namespace plugin
+} // namespace nvinfer1
+
+#endif // TRT_EFFICIENT_NMS_LANDMARK_PLUGIN_H
diff --git a/plugin/roIAlign2Plugin/CMakeLists.txt b/plugin/roIAlign2Plugin/CMakeLists.txt
new file mode 100644
index 00000000..53ddf941
--- /dev/null
+++ b/plugin/roIAlign2Plugin/CMakeLists.txt
@@ -0,0 +1,19 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+file(GLOB SRCS *.cpp)
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} ${SRCS})
+set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE)
diff --git a/plugin/roIAlign2Plugin/README.md b/plugin/roIAlign2Plugin/README.md
new file mode 100644
index 00000000..3a66a0c0
--- /dev/null
+++ b/plugin/roIAlign2Plugin/README.md
@@ -0,0 +1,102 @@
+# RoIAlign2Plugin
+
+**Table Of Contents**
+- [Changelog](#changelog)
+- [Description](#description)
+- [Structure](#structure)
+- [Parameters](#parameters)
+- [Compatibility Modes](#compatibility-modes)
+- [Additional Resources](#additional-resources)
+- [License](#license)
+- [Known issues](#known-issues)
+
+## Changelog
+
+February 2022
+Major refactoring of the plugin to add new features and compatibility modes.
+
+June 2019
+This is the first release of this `README.md` file.
+
+## Description
+
+The `RoIAlign2Plugin` performs the ROIAlign operation on a single feature map, such as one output level of an FPN (Feature Pyramid Network). It is used in many implementations of FasterRCNN and MaskRCNN, and the operation is also known as ROIPooling.
+
+## Structure
+
+#### Inputs
+
+This plugin works in NCHW format. It takes two input tensors:
+
+- `feature_map` with shape `[N, C, 256, 256]`, usually corresponding to `P2`.
+- `rois` with shape `[N, R, 4]`, the proposal ROI coordinates, where `N` is the batch size, `R` is the number of ROI candidates and `4` is the number of coordinates. These usually come from a Proposals layer or an NMS operation, such as the [EfficientNMS](https://github.com/NVIDIA/TensorRT/tree/main/plugin/efficientNMSPlugin) plugin.
+
+
+#### Outputs
+
+This plugin generates one output tensor of shape `[N, R, C, pooled_size, pooled_size]` where `C` is the same number of channels as the feature maps, and `pooled_size` is the configured height (and width) of the feature area after ROIAlign.
+
+## Parameters
+
+This plugin has the plugin creator class `RoIAlign2DynamicPluginCreator` and the plugin class `RoIAlign2DynamicPlugin`.
+
+The following parameters are used to create a `RoIAlign2DynamicPlugin` instance (a short usage sketch follows the table below):
+
+| Type | Parameter | Default | Description
+|---------|------------------------|------------|--------------------------------------------------------
+| `int` | `pooled_size` | 7 | The spatial size of a feature area after ROIAlign will be `[pooled_size, pooled_size]`
+| `int[]` | `image_size` | 1024,1024 | A 2-element array with the input image size of the entire network, in `[image_height, image_width]` layout
+| `int` | `sampling_ratio` | 0 | If set to 1 or larger, the number of samples to take for each output element; if set to 0, the sampling ratio is calculated adaptively from the size of the ROI.
+| `int` | `roi_coords_absolute` | 1 | If set to 0, the ROIs are normalized in [0-1] range. If set to 1, the ROIs are in full image space.
+| `int` | `roi_coords_swap` | 0 | If set to 0, the ROIs are in `[x1,y1,x2,y2]` format (PyTorch standard). If set to 1, they are in `[y1,x1,y2,x2]` format (TensorFlow standard).
+| `int` | `roi_coords_transform` | 2 | The coordinate transformation method to use for the ROI Align operation. If set to 2, `half_pixel` sampling will be performed. If set to 1, `output_half_pixel` will be performed. If set to 0, no pixel offset will be applied. More details on compatibility modes below.
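+
+As a quick illustration (this snippet is not part of the plugin sources), the sketch below shows one way to instantiate the plugin through the TensorRT plugin registry using the default values from the table above. The helper name `createRoIAlign2Example` and the layer name `"roi_align_2"` are placeholders, and it assumes the plugin library built from this repository has already been loaded and its creators registered (for example via `initLibNvInferPlugins`).
+
+```cpp
+#include <cstdint>
+#include <vector>
+
+#include "NvInfer.h"
+
+// Illustrative only: build a "RoIAlign2Dynamic_TRT" plugin instance with the
+// default parameters listed in the table above.
+nvinfer1::IPluginV2* createRoIAlign2Example()
+{
+    using nvinfer1::PluginField;
+    using nvinfer1::PluginFieldType;
+
+    auto* creator = getPluginRegistry()->getPluginCreator("RoIAlign2Dynamic_TRT", "1");
+    if (creator == nullptr)
+    {
+        return nullptr; // plugin creator not registered
+    }
+
+    int32_t pooledSize = 7;
+    int32_t imageSize[2] = {1024, 1024};
+    int32_t samplingRatio = 0;
+    int32_t absCoords = 1;
+    int32_t swapCoords = 0;
+    int32_t transformCoords = 2; // half_pixel sampling
+
+    std::vector<PluginField> fields{
+        {"pooled_size", &pooledSize, PluginFieldType::kINT32, 1},
+        {"image_size", imageSize, PluginFieldType::kINT32, 2},
+        {"sampling_ratio", &samplingRatio, PluginFieldType::kINT32, 1},
+        {"roi_coords_absolute", &absCoords, PluginFieldType::kINT32, 1},
+        {"roi_coords_swap", &swapCoords, PluginFieldType::kINT32, 1},
+        {"roi_coords_transform", &transformCoords, PluginFieldType::kINT32, 1}};
+
+    nvinfer1::PluginFieldCollection fc{};
+    fc.nbFields = static_cast<int32_t>(fields.size());
+    fc.fields = fields.data();
+
+    // "roi_align_2" is only a placeholder layer name.
+    return creator->createPlugin("roi_align_2", &fc);
+}
+```
+
+The returned `IPluginV2*` can then be added to a network with `INetworkDefinition::addPluginV2`, passing the `feature_map` and `rois` tensors as its two inputs.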
+
+## Compatibility Modes
+
+There exist many implementations of FasterRCNN and MaskRCNN, and unfortunately, there is no consensus on a canonical way to execute the ROI Pooling of an FPN. This plugin attempts to support multiple common implementations, configurable via the various parameters that have been exposed.
+
+#### Detectron 2
+
+To replicate the standard ROI Pooling behavior of [Detectron 2](https://github.com/facebookresearch/detectron2), set the parameters as follows:
+
+- `roi_coords_transform`: 2. This implementation uses half_pixel coordinate offsets.
+- `roi_coords_swap`: 0. This implementation follows the PyTorch standard for coordinate layout.
+- `roi_coords_absolute`: 1. This implementation works with full-size ROI coordinates.
+- `sampling_ratio`: 0. This implementation uses an adaptive sampling ratio determined from each ROI area.
+
+#### MaskRCNN Benchmark
+
+To replicate the standard ROI Pooling behavior of [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark), set the parameters as follows:
+
+- `roi_coords_transform`: 1. This implementation uses output_half_pixel coordinate offsets.
+- `roi_coords_swap`: 0. This implementation follows the PyTorch standard for coordinate layout.
+- `roi_coords_absolute`: 1. This implementation works with full-size ROI coordinates.
+- `sampling_ratio`: 2. This implementation performs two samples per output element.
+
+#### Other Implementations
+
+Other FPN ROI Pooling implementations can be supported by understanding how the various parameters work internally.
+
+**Coordinate Transformation**: This flag primarily defines the offsets applied to coordinates when performing the bilinear interpolation sampling for ROI Align. The supported values work as follows (a short illustrative sketch follows this list):
+- `roi_coords_transform` = -1: This is a backward-compatibility mode that calculates the scale by subtracting one from both the input and output dimensions. This is similar to the `align_corners` resize method.
+- `roi_coords_transform` = 0: This is a naive implementation where no pixel offset is applied anywhere. It is similar to the `asymmetric` resize method.
+- `roi_coords_transform` = 1: This performs half pixel offset by applying a 0.5 offset only in the output element sampling. This is similar to the `output_half_pixel` ROI Align method.
+- `roi_coords_transform` = 2: This performs half pixel offset by applying a 0.5 offset in the output element sampling, but also to the input map coordinate. This is similar to the `half_pixel` ROI Align method, and is the favored method of performing ROI Align.
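+
+As a purely illustrative sketch (not the plugin's actual CUDA kernel), the helper below condenses the difference between modes 0, 1 and 2 into a single function that maps one output sample index to a source coordinate on the feature map. The function name, the assumption that `roiStart`/`roiSize` are already in feature-map pixels, and the single-sample-per-bin simplification are all assumptions made for clarity.
+
+```cpp
+// Illustrative sketch of the roi_coords_transform modes. Assumes roiStart and
+// roiSize are already expressed in feature-map pixels, and takes one sample
+// per output bin (the real kernel samples a grid of points per bin).
+float sourceCoord(int transformMode, float roiStart, float roiSize, int pooledSize, int outIdx)
+{
+    float binSize = roiSize / pooledSize;
+    if (transformMode == 0)
+    {
+        // asymmetric: no pixel offset anywhere
+        return roiStart + outIdx * binSize;
+    }
+    if (transformMode == 1)
+    {
+        // output_half_pixel: +0.5 offset only on the output sample
+        return roiStart + (outIdx + 0.5f) * binSize;
+    }
+    // half_pixel (mode 2): +0.5 on the output sample plus a -0.5 shift of the
+    // input map coordinate
+    return roiStart + (outIdx + 0.5f) * binSize - 0.5f;
+}
+```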
+
+## Additional Resources
+
+The following resources provide a deeper understanding of the `RoIAlign2Plugin` plugin:
+
+- [MaskRCNN](https://github.com/matterport/Mask_RCNN)
+- [FPN](https://arxiv.org/abs/1612.03144)
+
+
+## License
+
+For terms and conditions for use, reproduction, and distribution, see the [TensorRT Software License Agreement](https://docs.nvidia.com/deeplearning/sdk/tensorrt-sla/index.html) documentation.
+
+
+## Known issues
+
+There are no known issues in this plugin.
diff --git a/plugin/roIAlign2Plugin/roIAlign2Plugin.cpp b/plugin/roIAlign2Plugin/roIAlign2Plugin.cpp
new file mode 100644
index 00000000..fddcb5b3
--- /dev/null
+++ b/plugin/roIAlign2Plugin/roIAlign2Plugin.cpp
@@ -0,0 +1,395 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "roIAlign2Plugin.h"
+#include "NvInfer.h"
+#include <cuda_runtime_api.h>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace nvinfer1;
+using nvinfer1::plugin::RoIAlign2DynamicPlugin;
+using nvinfer1::plugin::RoIAlign2BasePluginCreator;
+using nvinfer1::plugin::RoIAlign2DynamicPluginCreator;
+
+// plugin specific constants
+namespace
+{
+static char const* ROIALIGN2_PLUGIN_VERSION{"1"};
+static char const* ROIALIGN2_PLUGIN_NAME{"RoIAlign2Dynamic_TRT"};
+} // namespace
+
+// Static class fields initialization
+PluginFieldCollection RoIAlign2BasePluginCreator::mFC{};
+std::vector<PluginField> RoIAlign2BasePluginCreator::mPluginAttributes;
+
+// Helper function for serializing plugin
+template <typename T>
+void writeToBuffer(char*& buffer, const T& val)
+{
+ *reinterpret_cast<T*>(buffer) = val;
+ buffer += sizeof(T);
+}
+
+// Helper function for deserializing plugin
+template <typename T>
+T readFromBuffer(char const*& buffer)
+{
+ T val = *reinterpret_cast<T const*>(buffer);
+ buffer += sizeof(T);
+ return val;
+}
+
+RoIAlign2DynamicPlugin::RoIAlign2DynamicPlugin(std::string const name, int pooledSize, int transformCoords,
+ bool absCoords, bool swapCoords, int samplingRatio, bool legacy, int imageSize)
+ : mLayerName(name)
+ , mPooledSize(pooledSize)
+ , mImageSize(imageSize)
+ , mTransformCoords(transformCoords)
+ , mAbsCoords(absCoords)
+ , mSwapCoords(swapCoords)
+ , mSamplingRatio(samplingRatio)
+ , mIsLegacy(legacy)
+{
+ PLUGIN_VALIDATE(pooledSize >= 1);
+ PLUGIN_VALIDATE(samplingRatio >= 0);
+}
+
+RoIAlign2DynamicPlugin::RoIAlign2DynamicPlugin(std::string const name, int pooledSize, int transformCoords,
+ bool absCoords, bool swapCoords, int samplingRatio, bool legacy, int imageSize, int featureLength, int roiCount,
+ int inputWidth, int inputHeight)
+ : mLayerName(name)
+ , mPooledSize(pooledSize)
+ , mImageSize(imageSize)
+ , mTransformCoords(transformCoords)
+ , mAbsCoords(absCoords)
+ , mSwapCoords(swapCoords)
+ , mSamplingRatio(samplingRatio)
+ , mIsLegacy(legacy)
+ , mFeatureLength(featureLength)
+ , mROICount(roiCount)
+ , mInputWidth(inputWidth)
+ , mInputHeight(inputHeight)
+{
+ PLUGIN_VALIDATE(pooledSize >= 1);
+ PLUGIN_VALIDATE(samplingRatio >= 0);
+}
+
+RoIAlign2DynamicPlugin::RoIAlign2DynamicPlugin(std::string const name, void const* data, size_t length)
+ : mLayerName(name)
+{
+ char const *d = reinterpret_cast<char const*>(data), *a = d;
+ mPooledSize = readFromBuffer<int>(d);
+ mImageSize = readFromBuffer<int>(d);
+ mTransformCoords = readFromBuffer<int>(d);
+ mAbsCoords = readFromBuffer