diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66274d34..a8d36f32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ if(LLVM_PACKAGE_VERSION VERSION_GREATER "9.0.1")
 endif()
 
 if(LLVM_PACKAGE_VERSION VERSION_EQUAL "15.0.0" OR LLVM_PACKAGE_VERSION VERSION_GREATER "15.0.0")
-    target_link_libraries(hipify-clang PRIVATE LLVMWindowsDriver)
+    target_link_libraries(hipify-clang PRIVATE LLVMWindowsDriver clangSupport)
 endif()
 
 if(MSVC)
@@ -86,12 +86,6 @@ else()
     set(addr_var )
 endif()
 
-# [ToDo] Remove SWDEV_331863 related guards from CMakeLists.txt and HipifyAction.cpp when the blocker SWDEV_331863 is overcome
-option (SWDEV_331863 "Enables SWDEV-331863 blocker workaround" OFF)
-if(SWDEV_331863)
-    add_definitions(-DSWDEV_331863)
-endif()
-
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS} ${addr_var}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\" ${addr_var}")
 
diff --git a/README.md b/README.md
index c43bbd8c..e9e983c4 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ After applying all the matchers, the output HIP source is produced.
 
 1. [**LLVM+CLANG**](http://releases.llvm.org) of at least version [4.0.0](http://releases.llvm.org/download.html#4.0.0); the latest stable and recommended release: [**14.0.6**](https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.6).
 
-2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [8.0](https://developer.nvidia.com/cuda-80-ga2-download-archive), the latest supported version is [**11.7.0**](https://developer.nvidia.com/cuda-downloads).
+2. [**CUDA**](https://developer.nvidia.com/cuda-downloads) of at least version [8.0](https://developer.nvidia.com/cuda-80-ga2-download-archive), the latest supported version is [**11.7.1**](https://developer.nvidia.com/cuda-downloads).
 
 <table align="center">
   <thead>
@@ -160,7 +160,7 @@ After applying all the matchers, the output HIP source is produced.
           <a href="https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.2">14.0.2</a>,
           <a href="https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.3">14.0.3</a>,
           <a href="https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.4">14.0.4</a>
-      <td bgcolor="eefaeb"><a href="https://developer.nvidia.com/cuda-downloads">11.7.0</a></td>
+      <td bgcolor="eefaeb"><a href="https://developer.nvidia.com/cuda-downloads">11.7.1</a></td>
       <td>works only with the patch <br> due to the clang's bug <a href="https://github.com/llvm/llvm-project/issues/54609">54609</a><br>
           <a href="patches/patch_for_clang_14.0.0_bug_54609.zip">patch for 14.0.0</a>**<br>
           <a href="patches/patch_for_clang_14.0.1_bug_54609.zip">patch for 14.0.1</a>**<br>
@@ -171,7 +171,7 @@ After applying all the matchers, the output HIP source is produced.
     <tr align="center">
       <td><a href="https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.5">14.0.5</a>,
           <a href="https://github.com/llvm/llvm-project/releases/tag/llvmorg-14.0.6"><b>14.0.6</b></a></td>
-      <td bgcolor="eefaeb"><a href="https://developer.nvidia.com/cuda-downloads"><b>11.7.0</b></a></td>
+      <td bgcolor="eefaeb"><a href="https://developer.nvidia.com/cuda-downloads"><b>11.7.1</b></a></td>
       <td colspan=2 bgcolor="eefaeb"><font color="green"><b>LATEST STABLE CONFIG</b></font></td>
     </tr>
   </tbody>
@@ -385,7 +385,7 @@ Ubuntu 14: LLVM 4.0.0 - 7.1.0, CUDA 7.0 - 9.0, cuDNN 5.0.5 - 7.6.5
 
 Ubuntu 16-18: LLVM 8.0.0 - 14.0.6, CUDA 8.0 - 10.2, cuDNN 5.1.10 - 8.0.5
 
-Ubuntu 20-21: LLVM 9.0.0 - 14.0.6, CUDA 8.0 - 11.7.0, cuDNN 5.1.10 - 8.4.1
+Ubuntu 20-21: LLVM 9.0.0 - 14.0.6, CUDA 8.0 - 11.7.1, cuDNN 5.1.10 - 8.4.1
 
 Minimum build system requirements for the above configurations:
 
@@ -560,8 +560,8 @@ Testing Time: 6.03s
 | 10.0.0 - 11.0.0 | 8.0 - 11.1   | 7.6.5  - 8.0.5 | 2017.15.9.30, 2019.16.8.3                | 3.19.2         | 3.9.1        |
 | 11.0.1 - 11.1.0 | 8.0 - 11.2.2 | 7.6.5  - 8.0.5 | 2017.15.9.31, 2019.16.8.4                | 3.19.3         | 3.9.2        |
 | 12.0.0 - 13.0.1 | 8.0 - 11.5.1 | 7.6.5  - 8.3.2 | 2017.15.9.43, 2019.16.11.9               | 3.22.2         | 3.10.2       |
-| 14.0.0 - 14.0.6 | 8.0 - 11.7.0 | 8.0.5  - 8.4.1 | 2017.15.9.49, 2019.16.11.17, 2022.17.2.6 | 3.23.2         | 3.10.5       |
-| 15.0.0git       | 8.0 - 11.7.0 | 8.0.5  - 8.4.1 | 2017.15.9.49, 2019.16.11.17, 2022.17.2.6 | 3.23.2         | 3.10.5       |
+| 14.0.0 - 14.0.6 | 8.0 - 11.7.1 | 8.0.5  - 8.4.1 | 2017.15.9.49, 2019.16.11.17, 2022.17.2.6 | 3.24.0         | 3.10.6       |
+| 15.0.0git       | 8.0 - 11.7.1 | 8.0.5  - 8.4.1 | 2017.15.9.49, 2019.16.11.17, 2022.17.2.6 | 3.24.0         | 3.10.6       |
 
 *Building with testing support by `Visual Studio 17 2022` on `Windows 10`:*
 
diff --git a/bin/hipify-perl b/bin/hipify-perl
index 45e1e372..07914304 100755
--- a/bin/hipify-perl
+++ b/bin/hipify-perl
@@ -967,7 +967,7 @@ push(@exclude_filelist, split(',', $exclude_files));
 %exclude_dirhash = map { $_ => 1 } @exclude_dirlist;
 %exclude_filehash = map { $_ => 1 } @exclude_filelist;
 
-@statNames = ("error", "init", "version", "device", "context", "module", "memory", "virtual_memory", "stream_ordered_memory", "addressing", "stream", "event", "external_resource_interop", "stream_memory", "execution", "graph", "occupancy", "texture", "surface", "peer", "graphics", "interactions", "profiler", "openGL", "D3D9", "D3D10", "D3D11", "VDPAU", "EGL", "thread", "complex", "library", "device_library", "device_function", "include", "include_cuda_main_header", "type", "literal", "numeric_literal", "define", "extern_shared", "kernel_launch");
+@statNames = ("error", "init", "version", "device", "context", "module", "memory", "virtual_memory", "stream_ordered_memory", "addressing", "stream", "event", "external_resource_interop", "stream_memory", "execution", "graph", "occupancy", "texture", "surface", "peer", "graphics", "interactions", "profiler", "openGL", "D3D9", "D3D10", "D3D11", "VDPAU", "EGL", "thread", "complex", "library", "device_library", "device_function", "include", "include_cuda_main_header", "include_cuda_main_header_v2", "type", "literal", "numeric_literal", "define", "extern_shared", "kernel_launch");
 
 sub totalStats {
     my %count = %{shift()};
@@ -1599,7 +1599,6 @@ sub rocSubstitutions {
     subst("cublasZtrsv", "rocblas_ztrsv", "library");
     subst("cublasZtrsv_v2", "rocblas_ztrsv", "library");
     subst("cublas.h", "rocblas.h", "include_cuda_main_header");
-    subst("cublas_v2.h", "rocblas.h", "include_cuda_main_header");
     subst("cublasAtomicsMode_t", "rocblas_atomics_mode", "type");
     subst("cublasContext", "_rocblas_handle", "type");
     subst("cublasDataType_t", "rocblas_datatype", "type");
@@ -1935,6 +1934,7 @@ sub simpleSubstitutions {
     subst("cuGraphAddHostNode", "hipGraphAddHostNode", "graph");
     subst("cuGraphAddKernelNode", "hipGraphAddKernelNode", "graph");
     subst("cuGraphChildGraphNodeGetGraph", "hipGraphChildGraphNodeGetGraph", "graph");
+    subst("cuGraphClone", "hipGraphClone", "graph");
     subst("cuGraphCreate", "hipGraphCreate", "graph");
     subst("cuGraphDestroy", "hipGraphDestroy", "graph");
     subst("cuGraphDestroyNode", "hipGraphDestroyNode", "graph");
@@ -1982,6 +1982,7 @@ sub simpleSubstitutions {
     subst("cudaGraphAddMemcpyNodeToSymbol", "hipGraphAddMemcpyNodeToSymbol", "graph");
     subst("cudaGraphAddMemsetNode", "hipGraphAddMemsetNode", "graph");
     subst("cudaGraphChildGraphNodeGetGraph", "hipGraphChildGraphNodeGetGraph", "graph");
+    subst("cudaGraphClone", "hipGraphClone", "graph");
     subst("cudaGraphCreate", "hipGraphCreate", "graph");
     subst("cudaGraphDestroy", "hipGraphDestroy", "graph");
     subst("cudaGraphDestroyNode", "hipGraphDestroyNode", "graph");
@@ -2075,6 +2076,7 @@ sub simpleSubstitutions {
     subst("cudaGetTextureAlignmentOffset", "hipGetTextureAlignmentOffset", "texture");
     subst("cudaGetTextureObjectResourceDesc", "hipGetTextureObjectResourceDesc", "texture");
     subst("cudaGetTextureObjectResourceViewDesc", "hipGetTextureObjectResourceViewDesc", "texture");
+    subst("cudaGetTextureObjectTextureDesc", "hipGetTextureObjectTextureDesc", "texture");
     subst("cudaGetTextureReference", "hipGetTextureReference", "texture");
     subst("cudaUnbindTexture", "hipUnbindTexture", "texture");
     subst("cudaCreateSurfaceObject", "hipCreateSurfaceObject", "surface");
@@ -3238,8 +3240,9 @@ sub simpleSubstitutions {
     subst("caffe2\/operators\/spatial_batch_norm_op.h", "caffe2\/operators\/hip\/spatial_batch_norm_op_miopen.hip", "include");
     subst("channel_descriptor.h", "hip\/channel_descriptor.h", "include");
     subst("cooperative_groups.h", "hip\/hip_cooperative_groups.h", "include");
+    subst("cublas_api.h", "hipblas.h", "include");
     subst("cuda_fp16.h", "hip\/hip_fp16.h", "include");
-    subst("cuda_profiler_api.h", "hip\/hip_profile.h", "include");
+    subst("cuda_profiler_api.h", "hip\/hip_runtime_api.h", "include");
     subst("cuda_runtime_api.h", "hip\/hip_runtime_api.h", "include");
     subst("cuda_texture_types.h", "hip\/hip_texture_types.h", "include");
     subst("cufftXt.h", "hipfftXt.h", "include");
@@ -3266,7 +3269,6 @@ sub simpleSubstitutions {
     subst("cuComplex.h", "hip\/hip_complex.h", "include_cuda_main_header");
     subst("cub\/cub.cuh", "hipcub\/hipcub.hpp", "include_cuda_main_header");
     subst("cublas.h", "hipblas.h", "include_cuda_main_header");
-    subst("cublas_v2.h", "hipblas.h", "include_cuda_main_header");
     subst("cuda.h", "hip\/hip_runtime.h", "include_cuda_main_header");
     subst("cuda_runtime.h", "hip\/hip_runtime.h", "include_cuda_main_header");
     subst("cudnn.h", "hipDNN.h", "include_cuda_main_header");
@@ -4644,6 +4646,7 @@ sub simpleSubstitutions {
     subst("REGISTER_CUDA_OPERATOR", "REGISTER_HIP_OPERATOR", "define");
     subst("REGISTER_CUDA_OPERATOR_CREATOR", "REGISTER_HIP_OPERATOR_CREATOR", "define");
     subst("_CubLog", "_HipcubLog", "define");
+    subst("__CUB_ALIGN_BYTES", "__HIPCUB_ALIGN_BYTES", "define");
     subst("__CUDACC__", "__HIPCC__", "define");
     subst("cudaArrayCubemap", "hipArrayCubemap", "define");
     subst("cudaArrayDefault", "hipArrayDefault", "define");
@@ -6132,7 +6135,6 @@ sub warnUnsupportedFunctions {
         "cudaGraphDebugDotFlagsExtSemasSignalNodeParams",
         "cudaGraphDebugDotFlagsEventNodeParams",
         "cudaGraphDebugDotFlags",
-        "cudaGraphClone",
         "cudaGraphAddMemFreeNode",
         "cudaGraphAddMemAllocNode",
         "cudaGraphAddExternalSemaphoresWaitNode",
@@ -6573,7 +6575,6 @@ sub warnUnsupportedFunctions {
         "cuGraphExecExternalSemaphoresSignalNodeSetParams",
         "cuGraphExecBatchMemOpNodeSetParams",
         "cuGraphDebugDotPrint",
-        "cuGraphClone",
         "cuGraphBatchMemOpNodeSetParams",
         "cuGraphBatchMemOpNodeGetParams",
         "cuGraphAddMemsetNode",
@@ -6678,10 +6679,14 @@ sub warnUnsupportedFunctions {
         "bsrilu02Info",
         "bsric02Info",
         "__curand_umul",
+        "__CUB_LP64__",
+        "_CUB_ASM_PTR_SIZE_",
+        "_CUB_ASM_PTR_",
         "PATCH_LEVEL",
         "MINOR_VERSION",
         "MAX_CUFFT_ERROR",
         "MAJOR_VERSION",
+        "CubVector",
         "CubDebugExit",
         "CUuserObject_st",
         "CUuserObject_flags_enum",
@@ -7688,14 +7693,22 @@ sub warnUnsupportedFunctions {
         "CUDA_ARRAY3D_2DARRAY",
         "CUB_USE_COOPERATIVE_GROUPS",
         "CUB_SUBSCRIPTION_FACTOR",
+        "CUB_STATIC_ASSERT",
         "CUB_SMEM_BANKS",
         "CUB_RUNTIME_ENABLED",
+        "CUB_ROUND_UP_NEAREST",
+        "CUB_ROUND_DOWN_NEAREST",
+        "CUB_QUOTIENT_FLOOR",
+        "CUB_QUOTIENT_CEILING",
         "CUB_PTX_SUBSCRIPTION_FACTOR",
         "CUB_PTX_SMEM_BANKS",
         "CUB_PTX_PREFER_CONFLICT_OVER_PADDING",
         "CUB_PTX_LOG_WARP_THREADS",
         "CUB_PTX_LOG_SMEM_BANKS",
+        "CUB_PREVENT_MACRO_SUBSTITUTION",
         "CUB_PREFER_CONFLICT_OVER_PADDING",
+        "CUB_MSVC_VERSION_FULL",
+        "CUB_MSVC_VERSION",
         "CUB_MAX_DEVICES",
         "CUB_LOG_WARP_THREADS",
         "CUB_LOG_SMEM_BANKS",
@@ -7703,7 +7716,35 @@ sub warnUnsupportedFunctions {
         "CUB_IS_DEVICE_CODE",
         "CUB_INCLUDE_HOST_CODE",
         "CUB_INCLUDE_DEVICE_CODE",
-        "CUB_CPP_DIALECT"
+        "CUB_IGNORE_DEPRECATED_DIALECT",
+        "CUB_IGNORE_DEPRECATED_CPP_DIALECT",
+        "CUB_IGNORE_DEPRECATED_CPP_11",
+        "CUB_IGNORE_DEPRECATED_COMPILER",
+        "CUB_IGNORE_DEPRECATED_API",
+        "CUB_HOST_COMPILER_UNKNOWN",
+        "CUB_HOST_COMPILER_MSVC",
+        "CUB_HOST_COMPILER_GCC",
+        "CUB_HOST_COMPILER_CLANG",
+        "CUB_HOST_COMPILER",
+        "CUB_DEVICE_COMPILER_UNKNOWN",
+        "CUB_DEVICE_COMPILER_NVCC",
+        "CUB_DEVICE_COMPILER_MSVC",
+        "CUB_DEVICE_COMPILER_GCC",
+        "CUB_DEVICE_COMPILER_CLANG",
+        "CUB_DEVICE_COMPILER",
+        "CUB_DEPRECATED",
+        "CUB_DEFINE_VECTOR_TYPE",
+        "CUB_DEFINE_DETECT_NESTED_TYPE",
+        "CUB_CPP_DIALECT",
+        "CUB_CPLUSPLUS",
+        "CUB_COMP_DEPR_IMPL1",
+        "CUB_COMP_DEPR_IMPL0",
+        "CUB_COMP_DEPR_IMPL",
+        "CUB_COMPILER_DEPRECATION_SOFT",
+        "CUB_COMPILER_DEPRECATION",
+        "CUB_CAT_",
+        "CUB_CAT",
+        "CUB_ALIGN"
     )
     {
         my $mt = m/($func)/g;
@@ -8153,7 +8194,7 @@ while (@ARGV) {
             transformHostFunctions();
             # TODO: would like to move this code outside loop but it uses $_ which contains the whole file
             unless ($no_output) {
-                my $apiCalls   = $ft{'error'} + $ft{'init'} + $ft{'version'} + $ft{'device'} + $ft{'context'} + $ft{'module'} + $ft{'memory'} + $ft{'virtual_memory'} + $ft{'stream_ordered_memory'} + $ft{'addressing'} + $ft{'stream'} + $ft{'event'} + $ft{'external_resource_interop'} + $ft{'stream_memory'} + $ft{'execution'} + $ft{'graph'} + $ft{'occupancy'} + $ft{'texture'} + $ft{'surface'} + $ft{'peer'} + $ft{'graphics'} + $ft{'interactions'} + $ft{'profiler'} + $ft{'openGL'} + $ft{'D3D9'} + $ft{'D3D10'} + $ft{'D3D11'} + $ft{'VDPAU'} + $ft{'EGL'} + $ft{'thread'} + $ft{'complex'} + $ft{'library'} + $ft{'device_library'} + $ft{'include'} + $ft{'include_cuda_main_header'} + $ft{'type'} + $ft{'literal'} + $ft{'numeric_literal'} + $ft{'define'};
+                my $apiCalls   = $ft{'error'} + $ft{'init'} + $ft{'version'} + $ft{'device'} + $ft{'context'} + $ft{'module'} + $ft{'memory'} + $ft{'virtual_memory'} + $ft{'stream_ordered_memory'} + $ft{'addressing'} + $ft{'stream'} + $ft{'event'} + $ft{'external_resource_interop'} + $ft{'stream_memory'} + $ft{'execution'} + $ft{'graph'} + $ft{'occupancy'} + $ft{'texture'} + $ft{'surface'} + $ft{'peer'} + $ft{'graphics'} + $ft{'interactions'} + $ft{'profiler'} + $ft{'openGL'} + $ft{'D3D9'} + $ft{'D3D10'} + $ft{'D3D11'} + $ft{'VDPAU'} + $ft{'EGL'} + $ft{'thread'} + $ft{'complex'} + $ft{'library'} + $ft{'device_library'} + $ft{'include'} + $ft{'include_cuda_main_header'} + $ft{'include_cuda_main_header_v2'} + $ft{'type'} + $ft{'literal'} + $ft{'numeric_literal'} + $ft{'define'};
                 my $kernStuff  = $hasDeviceCode + $ft{'kernel_launch'} + $ft{'device_function'};
                 my $totalCalls = $apiCalls + $kernStuff;
                 $is_dos = m/\r\n$/;
diff --git a/doc/markdown/CUBLAS_API_supported_by_HIP.md b/doc/markdown/CUBLAS_API_supported_by_HIP.md
index c487dc3a..238db117 100644
--- a/doc/markdown/CUBLAS_API_supported_by_HIP.md
+++ b/doc/markdown/CUBLAS_API_supported_by_HIP.md
@@ -63,7 +63,7 @@
 |`CUBLAS_GEMM_ALGO8_TENSOR_OP`|9.2| | | | | | | |
 |`CUBLAS_GEMM_ALGO9`|9.0| | | | | | | |
 |`CUBLAS_GEMM_ALGO9_TENSOR_OP`|9.2| | | | | | | |
-|`CUBLAS_GEMM_DEFAULT`|8.0| | |`HIPBLAS_GEMM_DEFAULT`|1.8.2| | | |
+|`CUBLAS_GEMM_DEFAULT`|9.0| | |`HIPBLAS_GEMM_DEFAULT`|1.8.2| | | |
 |`CUBLAS_GEMM_DEFAULT_TENSOR_OP`|9.0| | | | | | | |
 |`CUBLAS_GEMM_DFALT`|8.0| | |`HIPBLAS_GEMM_DEFAULT`|1.8.2| | | |
 |`CUBLAS_GEMM_DFALT_TENSOR_OP`|9.0| | | | | | | |
diff --git a/doc/markdown/CUB_API_supported_by_HIP.md b/doc/markdown/CUB_API_supported_by_HIP.md
index 51c19d90..c0015286 100644
--- a/doc/markdown/CUB_API_supported_by_HIP.md
+++ b/doc/markdown/CUB_API_supported_by_HIP.md
@@ -4,19 +4,50 @@
 
 |**CUDA**|**A**|**D**|**R**|**HIP**|**A**|**D**|**R**|**E**|
 |:--|:-:|:-:|:-:|:--|:-:|:-:|:-:|:-:|
+|`CUB_ALIGN`| | | | | | | | |
+|`CUB_CAT`| | | | | | | | |
+|`CUB_CAT_`| | | | | | | | |
+|`CUB_COMPILER_DEPRECATION`| | | | | | | | |
+|`CUB_COMPILER_DEPRECATION_SOFT`| | | | | | | | |
+|`CUB_COMP_DEPR_IMPL`| | | | | | | | |
+|`CUB_COMP_DEPR_IMPL0`| | | | | | | | |
+|`CUB_COMP_DEPR_IMPL1`| | | | | | | | |
+|`CUB_CPLUSPLUS`| | | | | | | | |
 |`CUB_CPP_DIALECT`| | | | | | | | |
+|`CUB_DEFINE_DETECT_NESTED_TYPE`| | | | | | | | |
+|`CUB_DEFINE_VECTOR_TYPE`| | | | | | | | |
+|`CUB_DEPRECATED`| | | | | | | | |
+|`CUB_DEVICE_COMPILER`| | | | | | | | |
+|`CUB_DEVICE_COMPILER_CLANG`| | | | | | | | |
+|`CUB_DEVICE_COMPILER_GCC`| | | | | | | | |
+|`CUB_DEVICE_COMPILER_MSVC`| | | | | | | | |
+|`CUB_DEVICE_COMPILER_NVCC`| | | | | | | | |
+|`CUB_DEVICE_COMPILER_UNKNOWN`| | | | | | | | |
+|`CUB_HOST_COMPILER`| | | | | | | | |
+|`CUB_HOST_COMPILER_CLANG`| | | | | | | | |
+|`CUB_HOST_COMPILER_GCC`| | | | | | | | |
+|`CUB_HOST_COMPILER_MSVC`| | | | | | | | |
+|`CUB_HOST_COMPILER_UNKNOWN`| | | | | | | | |
+|`CUB_IGNORE_DEPRECATED_API`| | | | | | | | |
+|`CUB_IGNORE_DEPRECATED_COMPILER`| | | | | | | | |
+|`CUB_IGNORE_DEPRECATED_CPP_11`| | | | | | | | |
+|`CUB_IGNORE_DEPRECATED_CPP_DIALECT`| | | | | | | | |
+|`CUB_IGNORE_DEPRECATED_DIALECT`| | | | | | | | |
 |`CUB_INCLUDE_DEVICE_CODE`| | | | | | | | |
 |`CUB_INCLUDE_HOST_CODE`| | | | | | | | |
 |`CUB_IS_DEVICE_CODE`| | | | | | | | |
 |`CUB_IS_HOST_CODE`| | | | | | | | |
 |`CUB_LOG_SMEM_BANKS`| | | | | | | | |
 |`CUB_LOG_WARP_THREADS`| | | | | | | | |
-|`CUB_MAX`| | | |`CUB_MAX`| | | | |
+|`CUB_MAX`| | | |`CUB_MAX`|4.5.0| | | |
 |`CUB_MAX_DEVICES`| | | | | | | | |
-|`CUB_MIN`| | | |`CUB_MIN`|2.5.0| | | |
+|`CUB_MIN`| | | |`CUB_MIN`|4.5.0| | | |
+|`CUB_MSVC_VERSION`| | | | | | | | |
+|`CUB_MSVC_VERSION_FULL`| | | | | | | | |
 |`CUB_NAMESPACE_BEGIN`| | | |`BEGIN_HIPCUB_NAMESPACE`|2.5.0| | | |
 |`CUB_NAMESPACE_END`| | | |`END_HIPCUB_NAMESPACE`|2.5.0| | | |
 |`CUB_PREFER_CONFLICT_OVER_PADDING`| | | | | | | | |
+|`CUB_PREVENT_MACRO_SUBSTITUTION`| | | | | | | | |
 |`CUB_PTX_ARCH`| | | |`HIPCUB_ARCH`|2.5.0| | | |
 |`CUB_PTX_LOG_SMEM_BANKS`| | | | | | | | |
 |`CUB_PTX_LOG_WARP_THREADS`| | | | | | | | |
@@ -24,15 +55,25 @@
 |`CUB_PTX_SMEM_BANKS`| | | | | | | | |
 |`CUB_PTX_SUBSCRIPTION_FACTOR`| | | | | | | | |
 |`CUB_PTX_WARP_THREADS`| | | |`HIPCUB_WARP_THREADS`|2.5.0| | | |
+|`CUB_QUOTIENT_CEILING`| | | | | | | | |
+|`CUB_QUOTIENT_FLOOR`| | | | | | | | |
+|`CUB_ROUND_DOWN_NEAREST`| | | | | | | | |
+|`CUB_ROUND_UP_NEAREST`| | | | | | | | |
 |`CUB_RUNTIME_ENABLED`| | | | | | | | |
 |`CUB_RUNTIME_FUNCTION`| | | |`HIPCUB_RUNTIME_FUNCTION`|2.5.0| | | |
 |`CUB_SMEM_BANKS`| | | | | | | | |
+|`CUB_STATIC_ASSERT`| | | | | | | | |
 |`CUB_STDERR`| | | |`HIPCUB_STDERR`|2.5.0| | | |
 |`CUB_SUBSCRIPTION_FACTOR`| | | | | | | | |
 |`CUB_USE_COOPERATIVE_GROUPS`| | | | | | | | |
 |`CubDebug`| | | |`HipcubDebug`|2.5.0| | | |
 |`CubDebugExit`| | | | | | | | |
+|`CubVector`| | | | | | | | |
+|`_CUB_ASM_PTR_`| | | | | | | | |
+|`_CUB_ASM_PTR_SIZE_`| | | | | | | | |
 |`_CubLog`| | | |`_HipcubLog`|2.5.0| | | |
+|`__CUB_ALIGN_BYTES`| | | |`__HIPCUB_ALIGN_BYTES`|4.5.0| | | |
+|`__CUB_LP64__`| | | | | | | | |
 
 
 \*A - Added; D - Deprecated; R - Removed; E - Experimental
\ No newline at end of file
diff --git a/doc/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/doc/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
index 976c1df1..1ed82e98 100644
--- a/doc/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
+++ b/doc/markdown/CUDA_Driver_API_functions_supported_by_HIP.md
@@ -1525,7 +1525,7 @@
 |`cuGraphBatchMemOpNodeGetParams`|11.7| | | | | | | |
 |`cuGraphBatchMemOpNodeSetParams`|11.7| | | | | | | |
 |`cuGraphChildGraphNodeGetGraph`|10.0| | |`hipGraphChildGraphNodeGetGraph`|5.0.0| | | |
-|`cuGraphClone`|10.0| | | | | | | |
+|`cuGraphClone`|10.0| | |`hipGraphClone`|5.0.0| | | |
 |`cuGraphCreate`|10.0| | |`hipGraphCreate`|4.3.0| | | |
 |`cuGraphDebugDotPrint`|11.3| | | | | | | |
 |`cuGraphDestroy`|10.0| | |`hipGraphDestroy`|4.3.0| | | |
diff --git a/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
index ac961553..4878393c 100644
--- a/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
+++ b/doc/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md
@@ -392,26 +392,26 @@
 
 |**CUDA**|**A**|**D**|**R**|**HIP**|**A**|**D**|**R**|**E**|
 |:--|:-:|:-:|:-:|:--|:-:|:-:|:-:|:-:|
-|`cuTexObjectGetTextureDesc`|9.0| | |`hipGetTextureObjectTextureDesc`|1.7.0| | | |
 |`cudaCreateTextureObject`| | | |`hipCreateTextureObject`|1.7.0| | | |
 |`cudaDestroyTextureObject`| | | |`hipDestroyTextureObject`|1.7.0| | | |
 |`cudaGetTextureObjectResourceDesc`| | | |`hipGetTextureObjectResourceDesc`|1.7.0| | | |
 |`cudaGetTextureObjectResourceViewDesc`| | | |`hipGetTextureObjectResourceViewDesc`|1.7.0| | | |
+|`cudaGetTextureObjectTextureDesc`| | | |`hipGetTextureObjectTextureDesc`|1.7.0| | | |
 
 ## **28. Surface Object Management**
 
 |**CUDA**|**A**|**D**|**R**|**HIP**|**A**|**D**|**R**|**E**|
 |:--|:-:|:-:|:-:|:--|:-:|:-:|:-:|:-:|
-|`cudaCreateSurfaceObject`|9.0| | |`hipCreateSurfaceObject`|1.9.0| | | |
-|`cudaDestroySurfaceObject`|9.0| | |`hipDestroySurfaceObject`|1.9.0| | | |
-|`cudaGetSurfaceObjectResourceDesc`|9.0| | | | | | | |
+|`cudaCreateSurfaceObject`| | | |`hipCreateSurfaceObject`|1.9.0| | | |
+|`cudaDestroySurfaceObject`| | | |`hipDestroySurfaceObject`|1.9.0| | | |
+|`cudaGetSurfaceObjectResourceDesc`| | | | | | | | |
 
 ## **29. Version Management**
 
 |**CUDA**|**A**|**D**|**R**|**HIP**|**A**|**D**|**R**|**E**|
 |:--|:-:|:-:|:-:|:--|:-:|:-:|:-:|:-:|
-|`cudaDriverGetVersion`|9.0| | |`hipDriverGetVersion`|1.6.0| | | |
-|`cudaRuntimeGetVersion`|9.0| | |`hipRuntimeGetVersion`|1.6.0| | | |
+|`cudaDriverGetVersion`| | | |`hipDriverGetVersion`|1.6.0| | | |
+|`cudaRuntimeGetVersion`| | | |`hipRuntimeGetVersion`|1.6.0| | | |
 
 ## **30. Graph Management**
 
@@ -437,7 +437,7 @@
 |`cudaGraphAddMemcpyNodeToSymbol`|11.1| | |`hipGraphAddMemcpyNodeToSymbol`|5.0.0| | | |
 |`cudaGraphAddMemsetNode`|10.0| | |`hipGraphAddMemsetNode`|4.3.0| | | |
 |`cudaGraphChildGraphNodeGetGraph`|10.0| | |`hipGraphChildGraphNodeGetGraph`|5.0.0| | | |
-|`cudaGraphClone`|10.0| | | | | | | |
+|`cudaGraphClone`|10.0| | |`hipGraphClone`|5.0.0| | | |
 |`cudaGraphCreate`|10.0| | |`hipGraphCreate`|4.3.0| | | |
 |`cudaGraphDebugDotPrint`|11.3| | | | | | | |
 |`cudaGraphDestroy`|10.0| | |`hipGraphDestroy`|4.3.0| | | |
diff --git a/src/CUDA2HIP.cpp b/src/CUDA2HIP.cpp
index b3f33a1e..1711966c 100644
--- a/src/CUDA2HIP.cpp
+++ b/src/CUDA2HIP.cpp
@@ -35,13 +35,14 @@ const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP {
   {"cuda_texture_types.h",      {"hip/hip_texture_types.h",      "", CONV_INCLUDE,             API_RUNTIME, 0}},
   {"texture_fetch_functions.h", {"",                             "", CONV_INCLUDE,             API_RUNTIME, 0}},
   {"vector_types.h",            {"hip/hip_vector_types.h",       "", CONV_INCLUDE,             API_RUNTIME, 0}},
-  {"cuda_profiler_api.h",       {"hip/hip_profile.h",            "", CONV_INCLUDE,             API_RUNTIME, 0}},
+  {"cuda_profiler_api.h",       {"hip/hip_runtime_api.h",        "", CONV_INCLUDE,             API_RUNTIME, 0}},
   {"cooperative_groups.h",      {"hip/hip_cooperative_groups.h", "", CONV_INCLUDE,             API_RUNTIME, 0}},
   // cuComplex includes
   {"cuComplex.h",               {"hip/hip_complex.h",            "", CONV_INCLUDE_CUDA_MAIN_H, API_COMPLEX, 0}},
   // cuBLAS includes
   {"cublas.h",                  {"hipblas.h",                    "rocblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS, 0}},
-  {"cublas_v2.h",               {"hipblas.h",                    "rocblas.h", CONV_INCLUDE_CUDA_MAIN_H, API_BLAS, 0}},
+  {"cublas_v2.h",               {"hipblas.h",                    "rocblas.h", CONV_INCLUDE_CUDA_MAIN_V2_H, API_BLAS, 0}},
+  {"cublas_api.h",              {"hipblas.h",                    "rocblas.h", CONV_INCLUDE, API_BLAS, 0}},
   // cuRAND includes
   {"curand.h",                  {"hiprand.h",                    "", CONV_INCLUDE_CUDA_MAIN_H, API_RAND, 0}},
   {"curand_kernel.h",           {"hiprand_kernel.h",             "", CONV_INCLUDE,             API_RAND, 0}},
diff --git a/src/CUDA2HIP_BLAS_API_functions.cpp b/src/CUDA2HIP_BLAS_API_functions.cpp
index fecd2c8c..0f6f5ae9 100644
--- a/src/CUDA2HIP_BLAS_API_functions.cpp
+++ b/src/CUDA2HIP_BLAS_API_functions.cpp
@@ -78,27 +78,30 @@ const std::map<llvm::StringRef, hipCounter> CUDA_BLAS_FUNCTION_MAP {
   {"cublasGetCudartVersion",         {"hipblasGetCudartVersion",         "",                                         CONV_LIB_FUNC, API_BLAS, 4, UNSUPPORTED}},
 
   // NRM2
-  {"cublasSnrm2",                    {"hipblasSnrm2",                    "rocblas_snrm2",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasDnrm2",                    {"hipblasDnrm2",                    "rocblas_dnrm2",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasScnrm2",                   {"hipblasScnrm2",                   "rocblas_scnrm2",                           CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasDznrm2",                   {"hipblasDznrm2",                   "rocblas_dznrm2",                           CONV_LIB_FUNC, API_BLAS, 5}},
+  // NRM2 functions' signatures differ from _v2 ones, hipblas and rocblas NRM2 functions have mapping to NRM2_v2 functions only
+  {"cublasSnrm2",                    {"hipblasSnrm2",                    "rocblas_snrm2",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasDnrm2",                    {"hipblasDnrm2",                    "rocblas_dnrm2",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasScnrm2",                   {"hipblasScnrm2",                   "rocblas_scnrm2",                           CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasDznrm2",                   {"hipblasDznrm2",                   "rocblas_dznrm2",                           CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
   {"cublasNrm2Ex",                   {"hipblasNrm2Ex",                   "rocblas_nrm2_ex",                          CONV_LIB_FUNC, API_BLAS, 5}},
 
   // DOT
-  {"cublasSdot",                     {"hipblasSdot",                     "rocblas_sdot",                             CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasDdot",                     {"hipblasDdot",                     "rocblas_ddot",                             CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasCdotu",                    {"hipblasCdotu",                    "rocblas_cdotu",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasCdotc",                    {"hipblasCdotc",                    "rocblas_cdotc",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasZdotu",                    {"hipblasZdotu",                    "rocblas_zdotu",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasZdotc",                    {"hipblasZdotc",                    "rocblas_zdotc",                            CONV_LIB_FUNC, API_BLAS, 5}},
+  // DOT functions' signatures differ from _v2 ones, hipblas and rocblas DOT functions have mapping to DOT_v2 functions only
+  {"cublasSdot",                     {"hipblasSdot",                     "rocblas_sdot",                             CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasDdot",                     {"hipblasDdot",                     "rocblas_ddot",                             CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasCdotu",                    {"hipblasCdotu",                    "rocblas_cdotu",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasCdotc",                    {"hipblasCdotc",                    "rocblas_cdotc",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasZdotu",                    {"hipblasZdotu",                    "rocblas_zdotu",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasZdotc",                    {"hipblasZdotc",                    "rocblas_zdotc",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
 
   // SCAL
-  {"cublasSscal",                    {"hipblasSscal",                    "rocblas_sscal",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasDscal",                    {"hipblasDscal",                    "rocblas_dscal",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasCscal",                    {"hipblasCscal",                    "rocblas_cscal",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasCsscal",                   {"hipblasCsscal",                   "rocblas_csscal",                           CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasZscal",                    {"hipblasZscal",                    "rocblas_zscal",                            CONV_LIB_FUNC, API_BLAS, 5}},
-  {"cublasZdscal",                   {"hipblasZdscal",                   "rocblas_zdscal",                           CONV_LIB_FUNC, API_BLAS, 5}},
+  // SCAL functions' signatures differ from _v2 ones, hipblas and rocblas SCAL functions have mapping to SCAL_v2 functions only
+  {"cublasSscal",                    {"hipblasSscal",                    "rocblas_sscal",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasDscal",                    {"hipblasDscal",                    "rocblas_dscal",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasCscal",                    {"hipblasCscal",                    "rocblas_cscal",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasCsscal",                   {"hipblasCsscal",                   "rocblas_csscal",                           CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasZscal",                    {"hipblasZscal",                    "rocblas_zscal",                            CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
+  {"cublasZdscal",                   {"hipblasZdscal",                   "rocblas_zdscal",                           CONV_LIB_FUNC, API_BLAS, 5, HIP_SUPPORTED_V2_ONLY}},
 
   // AXPY
   {"cublasSaxpy",                    {"hipblasSaxpy",                    "rocblas_saxpy",                            CONV_LIB_FUNC, API_BLAS, 5}},
diff --git a/src/CUDA2HIP_BLAS_API_types.cpp b/src/CUDA2HIP_BLAS_API_types.cpp
index 3cd4caf1..997183f1 100644
--- a/src/CUDA2HIP_BLAS_API_types.cpp
+++ b/src/CUDA2HIP_BLAS_API_types.cpp
@@ -37,7 +37,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_BLAS_TYPE_NAME_MAP {
   {"CUBLAS_OP_T",                    {"HIPBLAS_OP_T",                    "rocblas_operation_transpose",           CONV_NUMERIC_LITERAL, API_BLAS, 2}},
   {"CUBLAS_OP_C",                    {"HIPBLAS_OP_C",                    "rocblas_operation_conjugate_transpose", CONV_NUMERIC_LITERAL, API_BLAS, 2}},
   {"CUBLAS_OP_HERMITAN",             {"HIPBLAS_OP_C",                    "rocblas_operation_conjugate_transpose", CONV_NUMERIC_LITERAL, API_BLAS, 2}},
-  {"CUBLAS_OP_CONJG",                {"HIPBLAS_OP_CONJG",                "rocblas_operation_conjugate",           CONV_NUMERIC_LITERAL, API_BLAS, 2, UNSUPPORTED}},
+  {"CUBLAS_OP_CONJG",                {"HIPBLAS_OP_CONJG",                "",                                      CONV_NUMERIC_LITERAL, API_BLAS, 2, UNSUPPORTED}},
 
   // Blas statuses
   {"cublasStatus",                   {"hipblasStatus_t",                 "rocblas_status",                        CONV_TYPE, API_BLAS, 2}},
@@ -205,7 +205,7 @@ const std::map<llvm::StringRef, cudaAPIversions> CUDA_BLAS_TYPE_NAME_VER_MAP {
   {"CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION", {CUDA_110, CUDA_0, CUDA_0}},
   {"cublasGemmAlgo_t",                                 {CUDA_80,  CUDA_0, CUDA_0}},
   {"CUBLAS_GEMM_DFALT",                                {CUDA_80,  CUDA_0, CUDA_0}},
-  {"CUBLAS_GEMM_DEFAULT",                              {CUDA_80,  CUDA_0, CUDA_0}},
+  {"CUBLAS_GEMM_DEFAULT",                              {CUDA_90,  CUDA_0, CUDA_0}},
   {"CUBLAS_GEMM_ALGO0",                                {CUDA_80,  CUDA_0, CUDA_0}},
   {"CUBLAS_GEMM_ALGO1",                                {CUDA_80,  CUDA_0, CUDA_0}},
   {"CUBLAS_GEMM_ALGO2",                                {CUDA_80,  CUDA_0, CUDA_0}},
diff --git a/src/CUDA2HIP_CUB_API_types.cpp b/src/CUDA2HIP_CUB_API_types.cpp
index ef5f406f..e3650eba 100644
--- a/src/CUDA2HIP_CUB_API_types.cpp
+++ b/src/CUDA2HIP_CUB_API_types.cpp
@@ -29,6 +29,9 @@ const std::map<llvm::StringRef, hipCounter> CUDA_CUB_NAMESPACE_MAP {
 
 // Maps the names of CUDA CUB API types to the corresponding HIP types
 const std::map<llvm::StringRef, hipCounter> CUDA_CUB_TYPE_NAME_MAP {
+  // 1. Structs
+  {"CubVector",                                {"HipcubVector",                              "", CONV_TYPE, API_CUB, 1, HIP_UNSUPPORTED}},
+
   // 5. Defines
   {"CUB_STDERR",                               {"HIPCUB_STDERR",                             "", CONV_DEFINE, API_CUB, 1}},
   {"CubDebug",                                 {"HipcubDebug",                               "", CONV_DEFINE, API_CUB, 1}},
@@ -59,6 +62,48 @@ const std::map<llvm::StringRef, hipCounter> CUDA_CUB_TYPE_NAME_MAP {
   {"CUB_PTX_PREFER_CONFLICT_OVER_PADDING",     {"HIPCUB_PREFER_CONFLICT_OVER_PADDING",       "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
   {"CUB_MAX",                                  {"CUB_MAX",                                   "", CONV_DEFINE, API_CUB, 1}},
   {"CUB_MIN",                                  {"CUB_MIN",                                   "", CONV_DEFINE, API_CUB, 1}},
+  {"__CUB_ALIGN_BYTES",                        {"__HIPCUB_ALIGN_BYTES",                      "", CONV_DEFINE, API_CUB, 1}},
+  {"CUB_DEFINE_VECTOR_TYPE",                   {"HIPCUB_DEFINE_VECTOR_TYPE",                 "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEFINE_DETECT_NESTED_TYPE",            {"HIPCUB_DEFINE_DETECT_NESTED_TYPE",          "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"__CUB_LP64__",                             {"__HIPCUB_LP64__",                           "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"_CUB_ASM_PTR_",                            {"_HIPCUB_ASM_PTR_",                          "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"_CUB_ASM_PTR_SIZE_",                       {"_HIPCUB_ASM_PTR_SIZE_",                     "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_USE_COOPERATIVE_GROUPS",               {"HIPCUB_USE_COOPERATIVE_GROUPS",             "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_ALIGN",                                {"HIPCUB_ALIGN",                              "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_PREVENT_MACRO_SUBSTITUTION",           {"HIPCUB_PREVENT_MACRO_SUBSTITUTION",         "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_QUOTIENT_FLOOR",                       {"HIPCUB_QUOTIENT_FLOOR",                     "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_QUOTIENT_CEILING",                     {"HIPCUB_QUOTIENT_CEILING",                   "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_ROUND_UP_NEAREST",                     {"HIPCUB_ROUND_UP_NEAREST",                   "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_ROUND_DOWN_NEAREST",                   {"HIPCUB_ROUND_DOWN_NEAREST",                 "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_STATIC_ASSERT",                        {"HIPCUB_STATIC_ASSERT",                      "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_CAT",                                  {"HIPCUB_CAT",                                "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_CAT_",                                 {"HIPCUB_CAT_",                               "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_API",                {"HIPCUB_IGNORE_DEPRECATED_API",              "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEPRECATED",                           {"HIPCUB_DEPRECATED",                         "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_HOST_COMPILER",                        {"HIPCUB_HOST_COMPILER",                      "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_HOST_COMPILER_MSVC",                   {"HIPCUB_HOST_COMPILER_MSVC",                 "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_HOST_COMPILER_CLANG",                  {"HIPCUB_HOST_COMPILER_CLANG",                "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_HOST_COMPILER_GCC",                    {"HIPCUB_HOST_COMPILER_GCC",                  "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_HOST_COMPILER_UNKNOWN",                {"HIPCUB_HOST_COMPILER_UNKNOWN",              "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER",                      {"HIPCUB_DEVICE_COMPILER",                    "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER_MSVC",                 {"HIPCUB_DEVICE_COMPILER_MSVC",               "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER_CLANG",                {"HIPCUB_DEVICE_COMPILER_CLANG",              "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER_GCC",                  {"HIPCUB_DEVICE_COMPILER_GCC",                "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER_NVCC",                 {"HIPCUB_DEVICE_COMPILER_NVCC",               "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_DEVICE_COMPILER_UNKNOWN",              {"HIPCUB_DEVICE_COMPILER_UNKNOWN",            "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_DIALECT",            {"HIPCUB_IGNORE_DEPRECATED_DIALECT",          "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_CPP_DIALECT",        {"HIPCUB_IGNORE_DEPRECATED_CPP_DIALECT",      "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_CPP_11",             {"HIPCUB_IGNORE_DEPRECATED_CPP_11",           "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_CPP_11",             {"HIPCUB_IGNORE_DEPRECATED_CPP_11",           "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_IGNORE_DEPRECATED_COMPILER",           {"HIPCUB_IGNORE_DEPRECATED_COMPILER",         "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_MSVC_VERSION",                         {"HIPCUB_MSVC_VERSION",                       "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_MSVC_VERSION_FULL",                    {"HIPCUB_MSVC_VERSION_FULL",                  "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_CPLUSPLUS",                            {"HIPCUB_CPLUSPLUS",                          "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_COMP_DEPR_IMPL",                       {"HIPCUB_COMP_DEPR_IMPL",                     "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_COMP_DEPR_IMPL0",                      {"HIPCUB_COMP_DEPR_IMPL0",                    "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_COMP_DEPR_IMPL1",                      {"HIPCUB_COMP_DEPR_IMPL1",                    "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_COMPILER_DEPRECATION",                 {"HIPCUB_COMPILER_DEPRECATION",               "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
+  {"CUB_COMPILER_DEPRECATION_SOFT",            {"HIPCUB_COMPILER_DEPRECATION_SOFT",          "", CONV_DEFINE, API_CUB, 1, HIP_UNSUPPORTED}},
 };
 
 const std::map<llvm::StringRef, cudaAPIversions> CUDA_CUB_TYPE_NAME_VER_MAP {
@@ -73,7 +118,9 @@ const std::map<llvm::StringRef, hipAPIversions> HIP_CUB_TYPE_NAME_VER_MAP {
   {"HIPCUB_ARCH",                            {HIP_2050, HIP_0,    HIP_0   }},
   {"BEGIN_HIPCUB_NAMESPACE",                 {HIP_2050, HIP_0,    HIP_0   }},
   {"END_HIPCUB_NAMESPACE",                   {HIP_2050, HIP_0,    HIP_0   }},
-  {"CUB_MIN",                                {HIP_2050, HIP_0,    HIP_0   }},
+  {"CUB_MAX",                                {HIP_4050, HIP_0,    HIP_0   }},
+  {"CUB_MIN",                                {HIP_4050, HIP_0,    HIP_0   }},
+  {"__HIPCUB_ALIGN_BYTES",                   {HIP_4050, HIP_0,    HIP_0   }},
 };
 
 const std::map<unsigned int, llvm::StringRef> CUDA_CUB_API_SECTION_MAP {
diff --git a/src/CUDA2HIP_Driver_API_functions.cpp b/src/CUDA2HIP_Driver_API_functions.cpp
index a3f4d2b3..81016ddc 100644
--- a/src/CUDA2HIP_Driver_API_functions.cpp
+++ b/src/CUDA2HIP_Driver_API_functions.cpp
@@ -591,7 +591,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DRIVER_FUNCTION_MAP {
   // cudaGraphChildGraphNodeGetGraph
   {"cuGraphChildGraphNodeGetGraph",                        {"hipGraphChildGraphNodeGetGraph",                          "", CONV_GRAPH, API_DRIVER, 21}},
   // cudaGraphClone
-  {"cuGraphClone",                                         {"hipGraphClone",                                           "", CONV_GRAPH, API_DRIVER, 21, HIP_UNSUPPORTED}},
+  {"cuGraphClone",                                         {"hipGraphClone",                                           "", CONV_GRAPH, API_DRIVER, 21}},
   // cudaGraphCreate
   {"cuGraphCreate",                                        {"hipGraphCreate",                                          "", CONV_GRAPH, API_DRIVER, 21}},
   // cudaGraphDebugDotPrint
diff --git a/src/CUDA2HIP_Runtime_API_functions.cpp b/src/CUDA2HIP_Runtime_API_functions.cpp
index 3c525c73..85f60896 100644
--- a/src/CUDA2HIP_Runtime_API_functions.cpp
+++ b/src/CUDA2HIP_Runtime_API_functions.cpp
@@ -640,8 +640,8 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_FUNCTION_MAP {
   // cuTexObjectGetResourceViewDesc
   {"cudaGetTextureObjectResourceViewDesc",                    {"hipGetTextureObjectResourceViewDesc",                    "", CONV_TEXTURE, API_RUNTIME, 27}},
   // no analogue
-  // NOTE: Not equal to cudaGetTextureObjectTextureDesc due to different signatures
-  {"cuTexObjectGetTextureDesc",                               {"hipGetTextureObjectTextureDesc",                         "", CONV_TEXTURE, API_RUNTIME, 27}},
+  // NOTE: Not equal to cuTexObjectGetTextureDesc due to different signatures
+  {"cudaGetTextureObjectTextureDesc",                         {"hipGetTextureObjectTextureDesc",                         "", CONV_TEXTURE, API_RUNTIME, 27}},
 
   // 28. Surface Object Management
   // no analogue
@@ -683,7 +683,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_FUNCTION_MAP {
   // cuGraphChildGraphNodeGetGraph
   {"cudaGraphChildGraphNodeGetGraph",                         {"hipGraphChildGraphNodeGetGraph",                         "", CONV_GRAPH, API_RUNTIME, 30}},
   // cuGraphClone
-  {"cudaGraphClone",                                          {"hipGraphClone",                                          "", CONV_GRAPH, API_RUNTIME, 30, HIP_UNSUPPORTED}},
+  {"cudaGraphClone",                                          {"hipGraphClone",                                          "", CONV_GRAPH, API_RUNTIME, 30}},
   // cuGraphCreate
   {"cudaGraphCreate",                                         {"hipGraphCreate",                                         "", CONV_GRAPH, API_RUNTIME, 30}},
   // cuGraphDebugDotPrint
@@ -833,7 +833,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_RUNTIME_FUNCTION_MAP {
   // cuGraphInstantiateWithFlags
   {"cudaGraphInstantiateWithFlags",                           {"hipGraphInstantiateWithFlags",                           "", CONV_GRAPH, API_RUNTIME, 30}},
   // cuGraphNodeSetEnabled
-  {"cudaGraphNodeSetEnabled",                                {"hipGraphNodeSetEnabled",                                  "", CONV_GRAPH, API_RUNTIME, 30, HIP_UNSUPPORTED}},
+  {"cudaGraphNodeSetEnabled",                                 {"hipGraphNodeSetEnabled",                                 "", CONV_GRAPH, API_RUNTIME, 30, HIP_UNSUPPORTED}},
 
   // 31. Driver Entry Point Access
   // cuGetProcAddress
@@ -963,12 +963,6 @@ const std::map<llvm::StringRef, cudaAPIversions> CUDA_RUNTIME_FUNCTION_VER_MAP {
   {"cudaUnbindTexture",                                       {CUDA_0,   CUDA_110, CUDA_0  }},
   {"cudaBindSurfaceToArray",                                  {CUDA_0,   CUDA_110, CUDA_0  }},
   {"cudaGetSurfaceReference",                                 {CUDA_0,   CUDA_110, CUDA_0  }},
-  {"cuTexObjectGetTextureDesc",                               {CUDA_90,  CUDA_0,   CUDA_0  }},
-  {"cudaCreateSurfaceObject",                                 {CUDA_90,  CUDA_0,   CUDA_0  }},
-  {"cudaDestroySurfaceObject",                                {CUDA_90,  CUDA_0,   CUDA_0  }},
-  {"cudaGetSurfaceObjectResourceDesc",                        {CUDA_90,  CUDA_0,   CUDA_0  }},
-  {"cudaDriverGetVersion",                                    {CUDA_90,  CUDA_0,   CUDA_0  }},
-  {"cudaRuntimeGetVersion",                                   {CUDA_90,  CUDA_0,   CUDA_0  }},
   {"cudaGraphAddChildGraphNode",                              {CUDA_100, CUDA_0,   CUDA_0  }},
   {"cudaGraphAddDependencies",                                {CUDA_100, CUDA_0,   CUDA_0  }},
   {"cudaGraphAddEmptyNode",                                   {CUDA_100, CUDA_0,   CUDA_0  }},
@@ -1288,6 +1282,7 @@ const std::map<llvm::StringRef, hipAPIversions> HIP_RUNTIME_FUNCTION_VER_MAP {
   {"hipGraphEventWaitNodeGetEvent",                           {HIP_5000, HIP_0,    HIP_0   }},
   {"hipGraphEventWaitNodeSetEvent",                           {HIP_5000, HIP_0,    HIP_0   }},
   {"hipGraphExecEventWaitNodeSetEvent",                       {HIP_5000, HIP_0,    HIP_0   }},
+  {"hipGraphClone",                                           {HIP_5000, HIP_0,    HIP_0   }},
   {"hipDeviceGetDefaultMemPool",                              {HIP_5020, HIP_0,    HIP_0,  HIP_LATEST}},
   {"hipDeviceSetMemPool",                                     {HIP_5020, HIP_0,    HIP_0,  HIP_LATEST}},
   {"hipDeviceGetMemPool",                                     {HIP_5020, HIP_0,    HIP_0,  HIP_LATEST}},
diff --git a/src/HipifyAction.cpp b/src/HipifyAction.cpp
index fb58ac87..ae02a4dd 100644
--- a/src/HipifyAction.cpp
+++ b/src/HipifyAction.cpp
@@ -173,27 +173,38 @@ void HipifyAction::FindAndReplace(StringRef name,
   }
   Statistics::current().incrementCounter(found->second, name.str());
   clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
-  // Warn the user about deprecated idenrifier.
+  // Warn about the deprecated identifier in CUDA but hipify it.
   if (Statistics::isDeprecated(found->second)) {
-    DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is deprecated."));
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "'%0' is deprecated in CUDA.");
+    DE.Report(sl, ID) << found->first;
   }
-  // Warn the user about unsupported experimental identifier.
+  // Warn about the unsupported experimental identifier.
   if (Statistics::isHipExperimental(found->second) &&!Experimental) {
     std::string sWarn;
     Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
     sWarn = "" + sWarn;
-    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is experimental in %0. To hipify it, use the '--experimental' option.");
-    DE.Report(sl, ID) << sWarn;
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "'%0' is experimental in '%1'; to hipify it, use the '--experimental' option.");
+    DE.Report(sl, ID) << found->first << sWarn;
     return;
   }
-
-  // Warn the user about unsupported identifier.
+  // Warn about the identifier which is supported only for _v2 version of it
+  // [NOTE]: Currently, only cuBlas is tracked for versioning and only for _v2;
+  // cublas_v2.h has to be included in the source cuda file for hipification.
+  if (Statistics::isHipSupportedV2Only(found->second) && found->second.apiType == API_BLAS && !insertedBLASHeader_V2) {
+    std::string sWarn;
+    Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
+    sWarn = "" + sWarn;
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Only '%0_v2' version of '%0' is supported in '%1'; to hipify it, include 'cublas_v2.h' in the source.");
+    DE.Report(sl, ID) << found->first << sWarn;
+    return;
+  }
+  // Warn about the unsupported identifier.
   if (Statistics::isUnsupported(found->second)) {
     std::string sWarn;
     Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
     sWarn = "" + sWarn;
-    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0.");
-    DE.Report(sl, ID) << sWarn;
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "'%0' is unsupported in '%1'.");
+    DE.Report(sl, ID) << found->first << sWarn;
     return;
   }
   if (!bReplace) {
@@ -288,6 +299,18 @@ bool HipifyAction::Exclude(const hipCounter &hipToken) {
         default:
           return false;
       }
+      return false;
+    case CONV_INCLUDE_CUDA_MAIN_V2_H:
+      switch (hipToken.apiType) {
+        case API_BLAS:
+          if (insertedBLASHeader_V2) return true;
+          insertedBLASHeader_V2 = true;
+          if (insertedBLASHeader) return true;
+          return false;
+        default:
+          return false;
+      }
+      return false;
     case CONV_INCLUDE:
       if (hipToken.hipName.empty()) return true;
       switch (hipToken.apiType) {
@@ -300,6 +323,7 @@ bool HipifyAction::Exclude(const hipCounter &hipToken) {
         default:
           return false;
       }
+      return false;
     default:
       return false;
   }
@@ -325,7 +349,10 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
   clang::SourceLocation sl = filename_range.getBegin();
   if (Statistics::isUnsupported(found->second)) {
     clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics();
-    DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header."));
+    std::string sWarn;
+    Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP;
+    const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "'%0' is unsupported header in '%1'.");
+    DE.Report(sl, ID) << found->first << sWarn;
     return;
   }
   clang::StringRef newInclude;
@@ -543,8 +570,8 @@ bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result)
       switch (c.second.castWarn) {
         case cw_DataLoss: {
           clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
-          const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Possible data loss in %0 argument.");
-          DE.Report(fullSL, ID) << argNum+1;
+          const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Possible data loss in %0 argument of '%1'.");
+          DE.Report(fullSL, ID) << argNum+1 << sName;
           break;
         }
         case cw_None:
@@ -698,10 +725,10 @@ class PPCallbackProxy : public clang::PPCallbacks {
 
 public:
   explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {}
-  // [ToDo] Remove SWDEV_331863 related guards from CMakeLists.txt and HipifyAction.cpp when the blocker SWDEV_331863 is overcome
+
   void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token,
                           StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
-#if (LLVM_VERSION_MAJOR < 15) || (LLVM_VERSION_MAJOR == 15 && SWDEV_331863)
+#if LLVM_VERSION_MAJOR < 15
                           const clang::FileEntry *file,
 #else
                           Optional<clang::FileEntryRef> file,
@@ -712,7 +739,7 @@ class PPCallbackProxy : public clang::PPCallbacks {
                         , clang::SrcMgr::CharacteristicKind FileType
 #endif
                          ) override {
-#if (LLVM_VERSION_MAJOR < 15) || (LLVM_VERSION_MAJOR == 15 && SWDEV_331863)
+#if LLVM_VERSION_MAJOR < 15
     auto f = file;
 #else
     auto f = &file->getFileEntry();
@@ -737,6 +764,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) {
 
 void HipifyAction::ExecuteAction() {
   clang::Preprocessor &PP = getCompilerInstance().getPreprocessor();
+  // Register yourself as the preprocessor callback, by proxy.
+  PP.addPPCallbacks(std::unique_ptr<PPCallbackProxy>(new PPCallbackProxy(*this)));
+  // Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching.
+  clang::ASTFrontendAction::ExecuteAction();
   auto &SM = getCompilerInstance().getSourceManager();
   // Start lexing the specified input file.
   llcompat::Memory_Buffer FromFile = llcompat::getMemoryBuffer(SM);
@@ -751,10 +782,6 @@ void HipifyAction::ExecuteAction() {
     RewriteToken(RawTok);
     RawLex.LexFromRawLexer(RawTok);
   }
-  // Register yourself as the preprocessor callback, by proxy.
-  PP.addPPCallbacks(std::unique_ptr<PPCallbackProxy>(new PPCallbackProxy(*this)));
-  // Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching.
-  clang::ASTFrontendAction::ExecuteAction();
 }
 
 void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) {
diff --git a/src/HipifyAction.h b/src/HipifyAction.h
index 02813f54..7fbf7134 100644
--- a/src/HipifyAction.h
+++ b/src/HipifyAction.h
@@ -49,6 +49,7 @@ class HipifyAction : public clang::ASTFrontendAction,
   // This approach means we do the best it's possible to do w.r.t preserving the user's include order.
   bool insertedRuntimeHeader = false;
   bool insertedBLASHeader = false;
+  bool insertedBLASHeader_V2 = false;
   bool insertedRANDHeader = false;
   bool insertedRAND_kernelHeader = false;
   bool insertedDNNHeader = false;
diff --git a/src/Statistics.cpp b/src/Statistics.cpp
index 6afa60ba..ade47baf 100644
--- a/src/Statistics.cpp
+++ b/src/Statistics.cpp
@@ -64,6 +64,7 @@ const char *counterNames[NUM_CONV_TYPES] = {
   "device_function", // CONV_DEVICE_FUNC
   "include", // CONV_INCLUDE
   "include_cuda_main_header", // CONV_INCLUDE_CUDA_MAIN_H
+  "include_cuda_main_header_v2", // CONV_INCLUDE_CUDA_MAIN_V2_H
   "type", // CONV_TYPE
   "literal", // CONV_LITERAL
   "numeric_literal", // CONV_NUMERIC_LITERAL
@@ -108,6 +109,7 @@ const char *counterTypes[NUM_CONV_TYPES] = {
   "CONV_LIB_DEVICE_FUNC",
   "CONV_INCLUDE",
   "CONV_INCLUDE_CUDA_MAIN_H",
+  "CONV_INCLUDE_CUDA_MAIN_V2_H",
   "CONV_TYPE",
   "CONV_LITERAL",
   "CONV_NUMERIC_LITERAL",
@@ -399,6 +401,10 @@ bool Statistics::isRemoved(const hipCounter &counter) {
          HIP_REMOVED == (counter.supportDegree & HIP_REMOVED));
 }
 
+bool Statistics::isHipSupportedV2Only(const hipCounter& counter) {
+  return HIP_SUPPORTED_V2_ONLY == (counter.supportDegree & HIP_SUPPORTED_V2_ONLY);
+}
+
 std::string Statistics::getCudaVersion(const cudaVersions& ver) {
   switch (ver) {
     case CUDA_0:
diff --git a/src/Statistics.h b/src/Statistics.h
index 259bc0bf..66b0c3a4 100644
--- a/src/Statistics.h
+++ b/src/Statistics.h
@@ -121,6 +121,7 @@ enum ConvTypes {
   CONV_DEVICE_FUNC,
   CONV_INCLUDE,
   CONV_INCLUDE_CUDA_MAIN_H,
+  CONV_INCLUDE_CUDA_MAIN_V2_H,
   CONV_TYPE,
   CONV_LITERAL,
   CONV_NUMERIC_LITERAL,
@@ -158,7 +159,8 @@ enum SupportDegree {
   CUDA_REMOVED = 0x40,
   HIP_REMOVED = 0x80,
   REMOVED = 0x100,
-  HIP_EXPERIMENTAL = 0x200
+  HIP_EXPERIMENTAL = 0x200,
+  HIP_SUPPORTED_V2_ONLY = 0x400
 };
 
 enum cudaVersions {
@@ -411,6 +413,8 @@ class Statistics {
   static bool isHipRemoved(const hipCounter& counter);
   // Check whether the counter is REMOVED or not.
   static bool isRemoved(const hipCounter& counter);
+  // Check whether the counter is HIP_SUPPORTED_V2_ONLY or not.
+  static bool isHipSupportedV2Only(const hipCounter& counter);
   // Get string CUDA version.
   static std::string getCudaVersion(const cudaVersions &ver);
   // Get string HIP version.
diff --git a/tests/unit_tests/headers/headers_test_07.cu b/tests/unit_tests/headers/headers_test_07.cu
index 1effc189..c125522c 100644
--- a/tests/unit_tests/headers/headers_test_07.cu
+++ b/tests/unit_tests/headers/headers_test_07.cu
@@ -1,8 +1,11 @@
 // RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
 
+// CHECK: #include <hip/hip_runtime.h>
 // CHECK: #include "hipblas.h"
 // CHECK-NOT: #include "cublas.h"
+// CHECK-NOT: #include "cublas_v2.h"
 // CHECK: #include <stdio.h>
-#include "cublas_v2.h"
 #include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "hipblas.h"
 #include <stdio.h>
diff --git a/tests/unit_tests/headers/headers_test_08.cu b/tests/unit_tests/headers/headers_test_08.cu
index aca7f194..84677d57 100644
--- a/tests/unit_tests/headers/headers_test_08.cu
+++ b/tests/unit_tests/headers/headers_test_08.cu
@@ -1,14 +1,18 @@
 // RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
 
 // CHECK: #include <hip/hip_runtime.h>
-// CHECK-NOT: #include <cuda_runtime.h>
 // CHECK: #include <iostream>
 // CHECK: #include "hipblas.h"
 // CHECK-NOT: #include "cublas.h"
+// CHECK-NOT: #include "cublas_v2.h"
 // CHECK: #include <stdio.h>
+// CHECK-NOT: #include <cuda.h>
 #include <cuda.h>
+// CHECK-NOT: #include <cuda_runtime.h>
 #include <cuda_runtime.h>
+// CHECK-NOT: #include <hip/hip_runtime.h>
 #include <iostream>
-#include "cublas_v2.h"
 #include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "hipblas.h"
 #include <stdio.h>
diff --git a/tests/unit_tests/headers/headers_test_09.cu b/tests/unit_tests/headers/headers_test_09.cu
index 37e718b5..067fe179 100644
--- a/tests/unit_tests/headers/headers_test_09.cu
+++ b/tests/unit_tests/headers/headers_test_09.cu
@@ -19,6 +19,7 @@
 
 // CHECK: #include "hipblas.h"
 // CHECK-NOT: #include "cublas.h"
+// CHECK-NOT: #include "cublas_v2.h"
 
 // CHECK: #include <stdio.h>
 
@@ -51,10 +52,12 @@
 // CHECK: #include "hipsparse.h"
 
 #include <cuda.h>
+// CHECK-NOT: #include <hip/hip_runtime.h>
 
 #include <memory>
 
 #include <cuda_runtime.h>
+// CHECK-NOT: #include <hip/hip_runtime.h>
 
 #include "cuda_runtime_api.h"
 #include "channel_descriptor.h"
@@ -67,8 +70,9 @@
 
 #include <iostream>
 
-#include "cublas_v2.h"
 #include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "hipblas.h"
 
 #include <stdio.h>
 
diff --git a/tests/unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu b/tests/unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu
index 69812c98..4d8cadf7 100644
--- a/tests/unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu
+++ b/tests/unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu
@@ -5,16 +5,25 @@
 #include <stdlib.h>
 #include <math.h>
 // CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas_v2.h"
 #include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "hipblas.h"
 #define M 6
 #define N 5
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
 static __inline__ void modify(float *m, int ldm, int n, int p, int q, float
   alpha, float beta) {
-  // CHECK: hipblasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
-  // CHECK: hipblasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
-  cublasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
-  cublasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
+  // CHECK: hipblasHandle_t blasHandle;
+  cublasHandle_t blasHandle;
+  // CHECK: hipblasStatus_t blasStatus = hipblasCreate(&blasHandle);
+  cublasStatus blasStatus = cublasCreate(&blasHandle);
+  // CHECK: hipblasSscal(blasHandle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
+  cublasSscal(blasHandle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
+  // CHECK: hipblasSscal(blasHandle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+  cublasSscal(blasHandle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+  // CHECK: hipblasDestroy(blasHandle);
+  cublasDestroy(blasHandle);
 }
 int main(void) {
   int i, j;
diff --git a/tests/unit_tests/libraries/cuBLAS/cublas_v1.cu b/tests/unit_tests/libraries/cuBLAS/cublas_v1.cu
new file mode 100644
index 00000000..74a3645a
--- /dev/null
+++ b/tests/unit_tests/libraries/cuBLAS/cublas_v1.cu
@@ -0,0 +1,83 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args
+
+// CHECK: #include <hip/hip_runtime.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+// CHECK: #include "hipblas.h"
+#include "cublas.h"
+#define M 6
+#define N 5
+#define IDX2C(i,j,ld) (((j)*(ld))+(i))
+static __inline__ void modify(float* m, int ldm, int n, int p, int q, float
+  alpha, float beta) {
+  // CHECK-NOT: hipblasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
+  // CHECK-NOT: hipblasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
+  // CHECK: cublasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
+  // CHECK: cublasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
+  cublasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
+  cublasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
+}
+int main(void) {
+  int i, j;
+  // CHECK: hipblasStatus_t stat;
+  cublasStatus stat;
+  float* devPtrA;
+  float* a = 0;
+  a = (float*)malloc(M * N * sizeof(*a));
+  if (!a) {
+    printf("host memory allocation failed");
+    return EXIT_FAILURE;
+  }
+  for (j = 0; j < N; j++) {
+    for (i = 0; i < M; i++) {
+      a[IDX2C(i, j, M)] = (float)(i * M + j + 1);
+    }
+  }
+  // cublasInit is not supported yet
+  cublasInit();
+  // cublasAlloc is not supported yet
+  stat = cublasAlloc(M * N, sizeof(*a), (void**)&devPtrA);
+  // CHECK: if (stat != HIPBLAS_STATUS_SUCCESS) {
+  if (stat != CUBLAS_STATUS_SUCCESS) {
+    printf("device memory allocation failed");
+    // cublasShutdown is not supported yet
+    cublasShutdown();
+    return EXIT_FAILURE;
+  }
+  // CHECK: stat = hipblasSetMatrix(M, N, sizeof(*a), a, M, devPtrA, M);
+  stat = cublasSetMatrix(M, N, sizeof(*a), a, M, devPtrA, M);
+  // CHECK: if (stat != HIPBLAS_STATUS_SUCCESS) {
+  if (stat != CUBLAS_STATUS_SUCCESS) {
+    printf("data download failed");
+    // cublasFree is not supported yet
+    cublasFree(devPtrA);
+    // cublasShutdown is not supported yet
+    cublasShutdown();
+    return EXIT_FAILURE;
+  }
+  modify(devPtrA, M, N, 1, 2, 16.0f, 12.0f);
+  // CHECK: stat = hipblasGetMatrix(M, N, sizeof(*a), devPtrA, M, a, M);
+  stat = cublasGetMatrix(M, N, sizeof(*a), devPtrA, M, a, M);
+  // CHECK: if (stat != HIPBLAS_STATUS_SUCCESS) {
+  if (stat != CUBLAS_STATUS_SUCCESS) {
+    printf("data upload failed");
+    // cublasFree is not supported yet
+    cublasFree(devPtrA);
+    // cublasShutdown is not supported yet
+    cublasShutdown();
+    return EXIT_FAILURE;
+  }
+  // cublasFree is not supported yet
+  cublasFree(devPtrA);
+  // cublasShutdown is not supported yet
+  cublasShutdown();
+  for (j = 0; j < N; j++) {
+    for (i = 0; i < M; i++) {
+      printf("%7.0f", a[IDX2C(i, j, M)]);
+    }
+    printf("\n");
+  }
+  free(a);
+  return EXIT_SUCCESS;
+}
diff --git a/tests/unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu b/tests/unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu
index 2c014cb0..a47561bd 100644
--- a/tests/unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu
+++ b/tests/unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu
@@ -5,16 +5,25 @@
 #include <stdlib.h>
 #include <math.h>
 // CHECK: #include "rocblas.h"
+// CHECK-NOT: #include "cublas_v2.h"
 #include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "rocblas.h"
 #define M 6
 #define N 5
 #define IDX2C(i,j,ld) (((j)*(ld))+(i))
 static __inline__ void modify(float *m, int ldm, int n, int p, int q, float
   alpha, float beta) {
-  // CHECK: rocblas_sscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
-  // CHECK: rocblas_sscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
-  cublasSscal(n - p, alpha, &m[IDX2C(p, q, ldm)], ldm);
-  cublasSscal(ldm - p, beta, &m[IDX2C(p, q, ldm)], 1);
+  // CHECK: rocblas_handle blasHandle;
+  cublasHandle_t blasHandle;
+  // CHECK: rocblas_status blasStatus = rocblas_create_handle(&blasHandle);
+  cublasStatus blasStatus = cublasCreate(&blasHandle);
+  // CHECK: rocblas_sscal(blasHandle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
+  // CHECK: rocblas_sscal(blasHandle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+  cublasSscal(blasHandle, n - p, &alpha, &m[IDX2C(p, q, ldm)], ldm);
+  cublasSscal(blasHandle, ldm - p, &beta, &m[IDX2C(p, q, ldm)], 1);
+  // CHECK: rocblas_destroy_handle(blasHandle);
+  cublasDestroy(blasHandle);
 }
 int main(void) {
   int i, j;
diff --git a/tests/unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp b/tests/unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp
index 7b3e4649..0fc567d4 100644
--- a/tests/unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp
+++ b/tests/unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 
 // CHECK: #include <hip/hip_runtime.h>
 #include <cuda.h>
-// CHECK: #include <hip/hip_profile.h>
+// CHECK: #include <hip/hip_runtime_api.h>
 #include <cuda_profiler_api.h>
 
 #define WIDTH 1024
diff --git a/tests/unit_tests/synthetic/driver_functions.cu b/tests/unit_tests/synthetic/driver_functions.cu
index 7a2b6b5c..7c557999 100644
--- a/tests/unit_tests/synthetic/driver_functions.cu
+++ b/tests/unit_tests/synthetic/driver_functions.cu
@@ -1031,6 +1031,11 @@ int main() {
   // HIP: hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph);
   // CHECK: result = hipGraphChildGraphNodeGetGraph(graphNode, &graph);
   result = cuGraphChildGraphNodeGetGraph(graphNode, &graph);
+
+  // CUDA: CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph);
+  // HIP: hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
+  // CHECK: result = hipGraphClone(&graph, graph2);
+  result = cuGraphClone(&graph, graph2);
 #endif
 
 #if CUDA_VERSION > 10000
@@ -1536,12 +1541,10 @@ int main() {
   // CHECK: result = hipTexObjectGetResourceViewDesc(&res_view_descr, texObject);
   result = cuTexObjectGetResourceViewDesc(&res_view_descr, texObject);
 
-#if CUDA_VERSION >= 9000
   // CUDA: CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
   // HIP: hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc, hipTextureObject_t texObject);
   // CHECK: result = hipTexObjectGetTextureDesc(&tex_descr, texObject);
   result = cuTexObjectGetTextureDesc(&tex_descr, texObject);
-#endif
 
   // CUDA: CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
   // HIP: DEPRECATED(DEPRECATED_MSG) hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags);
diff --git a/tests/unit_tests/synthetic/libraries/cublas2hipblas.cu b/tests/unit_tests/synthetic/libraries/cublas2hipblas.cu
new file mode 100644
index 00000000..b8bdfd48
--- /dev/null
+++ b/tests/unit_tests/synthetic/libraries/cublas2hipblas.cu
@@ -0,0 +1,399 @@
+// RUN: %run_test hipify "%s" "%t" %hipify_args 2 --skip-excluded-preprocessor-conditional-blocks --experimental %clang_args -D__CUDA_API_VERSION_INTERNAL
+
+// CHECK: #include <hip/hip_runtime.h>
+#include <cuda_runtime.h>
+// CHECK: #include "hipblas.h"
+// CHECK-NOT: #include "cublas_v2.h"
+#include "cublas.h"
+#include "cublas_v2.h"
+// CHECK-NOT: #include "hipblas.h"
+
+int main() {
+  printf("14. cuBLAS API to hipBLAS API synthetic test\n");
+
+  // CHECK: hipblasOperation_t blasOperation;
+  // CHECK-NEXT: hipblasOperation_t BLAS_OP_N = HIPBLAS_OP_N;
+  // CHECK-NEXT: hipblasOperation_t BLAS_OP_T = HIPBLAS_OP_T;
+  // CHECK-NEXT: hipblasOperation_t BLAS_OP_C = HIPBLAS_OP_C;
+  cublasOperation_t blasOperation;
+  cublasOperation_t BLAS_OP_N = CUBLAS_OP_N;
+  cublasOperation_t BLAS_OP_T = CUBLAS_OP_T;
+  cublasOperation_t BLAS_OP_C = CUBLAS_OP_C;
+
+#if CUDA_VERSION >= 10010
+  // CHECK: hipblasOperation_t BLAS_OP_HERMITAN = HIPBLAS_OP_C;
+  cublasOperation_t BLAS_OP_HERMITAN = CUBLAS_OP_HERMITAN;
+#endif
+
+  // CHECK: hipblasStatus_t blasStatus;
+  // CHECK-NEXT: hipblasStatus_t blasStatus_t;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_SUCCESS = HIPBLAS_STATUS_SUCCESS;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_NOT_INITIALIZED = HIPBLAS_STATUS_NOT_INITIALIZED;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_ALLOC_FAILED = HIPBLAS_STATUS_ALLOC_FAILED;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_INVALID_VALUE = HIPBLAS_STATUS_INVALID_VALUE;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_MAPPING_ERROR = HIPBLAS_STATUS_MAPPING_ERROR;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_EXECUTION_FAILED = HIPBLAS_STATUS_EXECUTION_FAILED;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_INTERNAL_ERROR = HIPBLAS_STATUS_INTERNAL_ERROR;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_NOT_SUPPORTED = HIPBLAS_STATUS_NOT_SUPPORTED;
+  // CHECK-NEXT: hipblasStatus_t BLAS_STATUS_ARCH_MISMATCH = HIPBLAS_STATUS_ARCH_MISMATCH;
+  cublasStatus blasStatus;
+  cublasStatus_t blasStatus_t;
+  cublasStatus_t BLAS_STATUS_SUCCESS = CUBLAS_STATUS_SUCCESS;
+  cublasStatus_t BLAS_STATUS_NOT_INITIALIZED = CUBLAS_STATUS_NOT_INITIALIZED;
+  cublasStatus_t BLAS_STATUS_ALLOC_FAILED = CUBLAS_STATUS_ALLOC_FAILED;
+  cublasStatus_t BLAS_STATUS_INVALID_VALUE = CUBLAS_STATUS_INVALID_VALUE;
+  cublasStatus_t BLAS_STATUS_MAPPING_ERROR = CUBLAS_STATUS_MAPPING_ERROR;
+  cublasStatus_t BLAS_STATUS_EXECUTION_FAILED = CUBLAS_STATUS_EXECUTION_FAILED;
+  cublasStatus_t BLAS_STATUS_INTERNAL_ERROR = CUBLAS_STATUS_INTERNAL_ERROR;
+  cublasStatus_t BLAS_STATUS_NOT_SUPPORTED = CUBLAS_STATUS_NOT_SUPPORTED;
+  cublasStatus_t BLAS_STATUS_ARCH_MISMATCH = CUBLAS_STATUS_ARCH_MISMATCH;
+
+  // CHECK: hipblasFillMode_t blasFillMode;
+  // CHECK-NEXT: hipblasFillMode_t BLAS_FILL_MODE_LOWER = HIPBLAS_FILL_MODE_LOWER;
+  // CHECK-NEXT: hipblasFillMode_t BLAS_FILL_MODE_UPPER = HIPBLAS_FILL_MODE_UPPER;
+  cublasFillMode_t blasFillMode;
+  cublasFillMode_t BLAS_FILL_MODE_LOWER = CUBLAS_FILL_MODE_LOWER;
+  cublasFillMode_t BLAS_FILL_MODE_UPPER = CUBLAS_FILL_MODE_UPPER;
+
+#if CUDA_VERSION >= 10010
+  // CHECK: hipblasFillMode_t BLAS_FILL_MODE_FULL = HIPBLAS_FILL_MODE_FULL;
+  cublasFillMode_t BLAS_FILL_MODE_FULL = CUBLAS_FILL_MODE_FULL;
+#endif
+
+  // CHECK: hipblasDiagType_t blasDiagType;
+  // CHECK-NEXT: hipblasDiagType_t BLAS_DIAG_NON_UNIT = HIPBLAS_DIAG_NON_UNIT;
+  // CHECK-NEXT: hipblasDiagType_t BLAS_DIAG_UNIT = HIPBLAS_DIAG_UNIT;
+  cublasDiagType_t blasDiagType;
+  cublasDiagType_t BLAS_DIAG_NON_UNIT = CUBLAS_DIAG_NON_UNIT;
+  cublasDiagType_t BLAS_DIAG_UNIT = CUBLAS_DIAG_UNIT;
+
+  // CHECK: hipblasSideMode_t blasSideMode;
+  // CHECK-NEXT: hipblasSideMode_t BLAS_SIDE_LEFT = HIPBLAS_SIDE_LEFT;
+  // CHECK-NEXT: hipblasSideMode_t BLAS_SIDE_RIGHT = HIPBLAS_SIDE_RIGHT;
+  cublasSideMode_t blasSideMode;
+  cublasSideMode_t BLAS_SIDE_LEFT = CUBLAS_SIDE_LEFT;
+  cublasSideMode_t BLAS_SIDE_RIGHT = CUBLAS_SIDE_RIGHT;
+
+  // CHECK: hipblasPointerMode_t blasPointerMode;
+  // CHECK-NEXT: hipblasPointerMode_t BLAS_POINTER_MODE_HOST = HIPBLAS_POINTER_MODE_HOST;
+  // CHECK-NEXT: hipblasPointerMode_t BLAS_POINTER_MODE_DEVICE = HIPBLAS_POINTER_MODE_DEVICE;
+  cublasPointerMode_t blasPointerMode;
+  cublasPointerMode_t BLAS_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST;
+  cublasPointerMode_t BLAS_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE;
+
+  // CHECK: hipblasAtomicsMode_t blasAtomicsMode;
+  // CHECK-NEXT: hipblasAtomicsMode_t BLAS_ATOMICS_NOT_ALLOWED = HIPBLAS_ATOMICS_NOT_ALLOWED;
+  // CHECK-NEXT: hipblasAtomicsMode_t BLAS_ATOMICS_ALLOWED = HIPBLAS_ATOMICS_ALLOWED;
+  cublasAtomicsMode_t blasAtomicsMode;
+  cublasAtomicsMode_t BLAS_ATOMICS_NOT_ALLOWED = CUBLAS_ATOMICS_NOT_ALLOWED;
+  cublasAtomicsMode_t BLAS_ATOMICS_ALLOWED = CUBLAS_ATOMICS_ALLOWED;
+
+#if CUDA_VERSION >= 8000
+  // CHECK: hipblasDatatype_t DataType;
+  // CHECK-NEXT: hipblasDatatype_t DataType_t;
+  // CHECK-NEXT: hipblasDatatype_t blasDataType;
+  // CHECK-NEXT: hipblasDatatype_t R_16F = HIPBLAS_R_16F;
+  // CHECK-NEXT: hipblasDatatype_t C_16F = HIPBLAS_C_16F;
+  // CHECK-NEXT: hipblasDatatype_t R_32F = HIPBLAS_R_32F;
+  // CHECK-NEXT: hipblasDatatype_t C_32F = HIPBLAS_C_32F;
+  // CHECK-NEXT: hipblasDatatype_t R_64F = HIPBLAS_R_64F;
+  // CHECK-NEXT: hipblasDatatype_t C_64F = HIPBLAS_C_64F;
+  // CHECK-NEXT: hipblasDatatype_t R_8I = HIPBLAS_R_8I;
+  // CHECK-NEXT: hipblasDatatype_t C_8I = HIPBLAS_C_8I;
+  // CHECK-NEXT: hipblasDatatype_t R_8U = HIPBLAS_R_8U;
+  // CHECK-NEXT: hipblasDatatype_t C_8U = HIPBLAS_C_8U;
+  // CHECK-NEXT: hipblasDatatype_t R_32I = HIPBLAS_R_32I;
+  // CHECK-NEXT: hipblasDatatype_t C_32I = HIPBLAS_C_32I;
+  // CHECK-NEXT: hipblasDatatype_t R_32U = HIPBLAS_R_32U;
+  // CHECK-NEXT: hipblasDatatype_t C_32U = HIPBLAS_C_32U;
+  cudaDataType DataType;
+  cudaDataType_t DataType_t;
+  cublasDataType_t blasDataType;
+  cublasDataType_t R_16F = CUDA_R_16F;
+  cublasDataType_t C_16F = CUDA_C_16F;
+  cublasDataType_t R_32F = CUDA_R_32F;
+  cublasDataType_t C_32F = CUDA_C_32F;
+  cublasDataType_t R_64F = CUDA_R_64F;
+  cublasDataType_t C_64F = CUDA_C_64F;
+  cublasDataType_t R_8I = CUDA_R_8I;
+  cublasDataType_t C_8I = CUDA_C_8I;
+  cublasDataType_t R_8U = CUDA_R_8U;
+  cublasDataType_t C_8U = CUDA_C_8U;
+  cublasDataType_t R_32I = CUDA_R_32I;
+  cublasDataType_t C_32I = CUDA_C_32I;
+  cublasDataType_t R_32U = CUDA_R_32U;
+  cublasDataType_t C_32U = CUDA_C_32U;
+#endif
+
+#if CUDA_VERSION >= 11000
+  // CHECK: hipblasDatatype_t R_16BF = HIPBLAS_R_16B;
+  // CHECK-NEXT: hipblasDatatype_t C_16BF = HIPBLAS_C_16B;
+  cublasDataType_t R_16BF = CUDA_R_16BF;
+  cublasDataType_t C_16BF = CUDA_C_16BF;
+#endif
+
+#if CUDA_VERSION >= 8000
+  // CHECK: hipblasGemmAlgo_t blasGemmAlgo;
+  // CHECK-NEXT: hipblasGemmAlgo_t BLAS_GEMM_DFALT = HIPBLAS_GEMM_DEFAULT;
+  cublasGemmAlgo_t blasGemmAlgo;
+  cublasGemmAlgo_t BLAS_GEMM_DFALT = CUBLAS_GEMM_DFALT;
+#endif
+
+#if CUDA_VERSION >= 9000
+  // CHECK: hipblasGemmAlgo_t BLAS_GEMM_DEFAULT = HIPBLAS_GEMM_DEFAULT;
+  cublasGemmAlgo_t BLAS_GEMM_DEFAULT = CUBLAS_GEMM_DEFAULT;
+#endif
+
+  // CHECK: hipblasHandle_t blasHandle;
+  cublasHandle_t blasHandle;
+
+// Functions
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetAtomicsMode(hipblasHandle_t handle, hipblasAtomicsMode_t* atomics_mode);
+  // CHECK: blasStatus = hipblasGetAtomicsMode(blasHandle, &blasAtomicsMode);
+  blasStatus = cublasGetAtomicsMode(blasHandle, &blasAtomicsMode);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetAtomicsMode(hipblasHandle_t handle, hipblasAtomicsMode_t atomics_mode);
+  // CHECK: blasStatus = hipblasSetAtomicsMode(blasHandle, blasAtomicsMode);
+  blasStatus = cublasSetAtomicsMode(blasHandle, blasAtomicsMode);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle);
+  // CUDA: #define cublasCreate cublasCreate_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCreate(hipblasHandle_t* handle);
+  // CHECK: blasStatus = hipblasCreate(&blasHandle);
+  // CHECK-NEXT: blasStatus = hipblasCreate(&blasHandle);
+  blasStatus = cublasCreate(&blasHandle);
+  blasStatus = cublasCreate_v2(&blasHandle);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
+  // CUDA: #define cublasDestroy cublasDestroy_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDestroy(hipblasHandle_t handle);
+  // CHECK: blasStatus = hipblasDestroy(blasHandle);
+  // CHECK-NEXT: blasStatus = hipblasDestroy(blasHandle);
+  blasStatus = cublasDestroy(blasHandle);
+  blasStatus = cublasDestroy_v2(blasHandle);
+
+  // CHECK: hipStream_t stream;
+  cudaStream_t stream;
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
+  // CUDA: #define cublasSetStream cublasSetStream_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetStream(hipblasHandle_t handle, hipStream_t streamId);
+  // CHECK: blasStatus = hipblasSetStream(blasHandle, stream);
+  // CHECK-NEXT: blasStatus = hipblasSetStream(blasHandle, stream);
+  blasStatus = cublasSetStream(blasHandle, stream);
+  blasStatus = cublasSetStream_v2(blasHandle, stream);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId);
+  // CUDA: #define cublasGetStream cublasGetStream_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetStream(hipblasHandle_t handle, hipStream_t* streamId);
+  // CHECK: blasStatus = hipblasGetStream(blasHandle, &stream);
+  // CHECK-NEXT: blasStatus = hipblasGetStream(blasHandle, &stream);
+  blasStatus = cublasGetStream(blasHandle, &stream);
+  blasStatus = cublasGetStream_v2(blasHandle, &stream);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode);
+  // CUDA: #define cublasSetPointerMode cublasSetPointerMode_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetPointerMode(hipblasHandle_t handle, hipblasPointerMode_t mode);
+  // CHECK: blasStatus = hipblasSetPointerMode(blasHandle, blasPointerMode);
+  // CHECK-NEXT: blasStatus = hipblasSetPointerMode(blasHandle, blasPointerMode);
+  blasStatus = cublasSetPointerMode(blasHandle, blasPointerMode);
+  blasStatus = cublasSetPointerMode_v2(blasHandle, blasPointerMode);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode);
+  // CUDA: #define cublasGetPointerMode cublasGetPointerMode_v2
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetPointerMode(hipblasHandle_t handle, hipblasPointerMode_t* mode);
+  // CHECK: blasStatus = hipblasGetPointerMode(blasHandle, &blasPointerMode);
+  // CHECK-NEXT: blasStatus = hipblasGetPointerMode(blasHandle, &blasPointerMode);
+  blasStatus = cublasGetPointerMode(blasHandle, &blasPointerMode);
+  blasStatus = cublasGetPointerMode_v2(blasHandle, &blasPointerMode);
+
+  int n = 0;
+  int num = 0;
+  int incx = 0;
+  int incy = 0;
+  void* image = nullptr;
+  void* image_2 = nullptr;
+  void* deviceptr = nullptr;
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
+  // CHECK: blasStatus = hipblasSetVector(n, num, image, incx, image_2, incy);
+  blasStatus = cublasSetVector(n, num, image, incx, image_2, incy);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
+  // CHECK: blasStatus = hipblasGetVector(n, num, image, incx, image_2, incy);
+  blasStatus = cublasGetVector(n, num, image, incx, image_2, incy);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetVectorAsync(int n, int elemSize, const void* x, int incx, void* y, int incy, hipStream_t stream);
+  // CHECK: blasStatus = hipblasSetVectorAsync(n, num, image, incx, image_2, incy, stream);
+  blasStatus = cublasSetVectorAsync(n, num, image, incx, image_2, incy, stream);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetVectorAsync(int n, int elemSize, const void* x, int incx, void* y, int incy, hipStream_t stream);
+  // CHECK: blasStatus = hipblasGetVectorAsync(n, num, image, incx, image_2, incy, stream);
+  blasStatus = cublasGetVectorAsync(n, num, image, incx, image_2, incy, stream);
+
+  int rows = 0;
+  int cols = 0;
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetMatrix(int rows, int cols, int elemSize, const void* AP, int lda, void* BP, int ldb);
+  // CHECK: blasStatus = hipblasSetMatrix(rows, cols, num, image, incx, image_2, incy);
+  blasStatus = cublasSetMatrix(rows, cols, num, image, incx, image_2, incy);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetMatrix(int rows, int cols, int elemSize, const void* AP, int lda, void* BP, int ldb);
+  // CHECK: blasStatus = hipblasGetMatrix(rows, cols, num, image, incx, image_2, incy);
+  blasStatus = cublasGetMatrix(rows, cols, num, image, incx, image_2, incy);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSetMatrixAsync(int rows, int cols, int elemSize, const void* AP, int lda, void* BP, int ldb, hipStream_t stream);
+  // CHECK: blasStatus = hipblasSetMatrixAsync(rows, cols, num, image, incx, image_2, incy, stream);
+  blasStatus = cublasSetMatrixAsync(rows, cols, num, image, incx, image_2, incy, stream);
+
+  // CUDA: cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasGetMatrixAsync(int rows, int cols, int elemSize, const void* AP, int lda, void* BP, int ldb, hipStream_t stream);
+  // CHECK: blasStatus = hipblasGetMatrixAsync(rows, cols, num, image, incx, image_2, incy, stream);
+  blasStatus = cublasGetMatrixAsync(rows, cols, num, image, incx, image_2, incy, stream);
+
+  cudaDataType DataType_2, DataType_3;
+
+  float fx = 0;
+  float fy = 0;
+  float fresult = 0;
+
+  double dx = 0;
+  double dy = 0;
+  double dresult = 0;
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSnrm2(hipblasHandle_t handle, int n, const float* x, int incx, float* result);
+  // CHECK: blasStatus = hipblasSnrm2(blasHandle, n, &fx, incx, &fresult);
+  // CHECK-NEXT: blasStatus = hipblasSnrm2(blasHandle, n, &fx, incx, &fresult);
+  blasStatus = cublasSnrm2(blasHandle, n, &fx, incx, &fresult);
+  blasStatus = cublasSnrm2_v2(blasHandle, n, &fx, incx, &fresult);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDnrm2(hipblasHandle_t handle, int n, const double* x, int incx, double* result);
+  // CHECK: blasStatus = hipblasDnrm2(blasHandle, n, &dx, incx, &dresult);
+  // CHECK-NEXT: blasStatus = hipblasDnrm2(blasHandle, n, &dx, incx, &dresult);
+  blasStatus = cublasDnrm2(blasHandle, n, &dx, incx, &dresult);
+  blasStatus = cublasDnrm2_v2(blasHandle, n, &dx, incx, &dresult);
+
+  // CHECK: hipComplex complex, complex_2, complex_res;
+  cuComplex complex, complex_2, complex_res;
+  // CHECK: hipDoubleComplex dcomplex, dcomplex_2, dcomplex_res;
+  cuDoubleComplex dcomplex, dcomplex_2, dcomplex_res;
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasScnrm2(hipblasHandle_t handle, int n, const hipblasComplex* x, int incx, float* result);
+  // CHECK: blasStatus = hipblasScnrm2(blasHandle, n, &complex, incx, &fresult);
+  // CHECK-NEXT: blasStatus = hipblasScnrm2(blasHandle, n, &complex, incx, &fresult);
+  blasStatus = cublasScnrm2(blasHandle, n, &complex, incx, &fresult);
+  blasStatus = cublasScnrm2_v2(blasHandle, n, &complex, incx, &fresult);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDznrm2(hipblasHandle_t handle, int n, const hipblasDoubleComplex* x, int incx, double* result);
+  // CHECK: blasStatus = hipblasDznrm2(blasHandle, n, &dcomplex, incx, &dresult);
+  // CHECK-NEXT: blasStatus = hipblasDznrm2(blasHandle, n, &dcomplex, incx, &dresult);
+  blasStatus = cublasDznrm2(blasHandle, n, &dcomplex, incx, &dresult);
+  blasStatus = cublasDznrm2_v2(blasHandle, n, &dcomplex, incx, &dresult);
+
+#if CUDA_VERSION >= 8000
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* result, cudaDataType resultType, cudaDataType executionType);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasNrm2Ex(hipblasHandle_t handle, int n, const void* x, hipblasDatatype_t xType, int incx, void* result, hipblasDatatype_t resultType, hipblasDatatype_t executionType);
+  // CHECK: blasStatus = hipblasNrm2Ex(blasHandle, n, image, DataType, incx, image_2, DataType_2, DataType_3);
+  blasStatus = cublasNrm2Ex(blasHandle, n, image, DataType, incx, image_2, DataType_2, DataType_3);
+#endif
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSdot(hipblasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result);
+  // CHECK: blasStatus = hipblasSdot(blasHandle, n, &fx, incx, &fy, incy, &fresult);
+  // CHECK-NEXT: blasStatus = hipblasSdot(blasHandle, n, &fx, incx, &fy, incy, &fresult);
+  blasStatus = cublasSdot(blasHandle, n, &fx, incx, &fy, incy, &fresult);
+  blasStatus = cublasSdot_v2(blasHandle, n, &fx, incx, &fy, incy, &fresult);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDdot(hipblasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result);
+  // CHECK: blasStatus = hipblasDdot(blasHandle, n, &dx, incx, &dy, incy, &dresult);
+  // CHECK-NEXT: blasStatus = hipblasDdot(blasHandle, n, &dx, incx, &dy, incy, &dresult);
+  blasStatus = cublasDdot(blasHandle, n, &dx, incx, &dy, incy, &dresult);
+  blasStatus = cublasDdot_v2(blasHandle, n, &dx, incx, &dy, incy, &dresult);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCdotu(hipblasHandle_t handle, int n, const hipblasComplex* x, int incx, const hipblasComplex* y, int incy, hipblasComplex* result);
+  // CHECK: blasStatus = hipblasCdotu(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  // CHECK-NEXT: blasStatus = hipblasCdotu(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  blasStatus = cublasCdotu(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  blasStatus = cublasCdotu_v2(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCdotc(hipblasHandle_t handle, int n, const hipblasComplex* x, int incx, const hipblasComplex* y, int incy, hipblasComplex* result);
+  // CHECK: blasStatus = hipblasCdotc(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  // CHECK-NEXT: blasStatus = hipblasCdotc(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  blasStatus = cublasCdotc(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+  blasStatus = cublasCdotc_v2(blasHandle, n, &complex, incx, &complex_2, incy, &complex_res);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasZdotu(hipblasHandle_t handle, int n, const hipblasDoubleComplex* x, int incx, const hipblasDoubleComplex* y, int incy, hipblasDoubleComplex* result);
+  // CHECK: blasStatus = hipblasZdotu(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  // CHECK-NEXT: blasStatus = hipblasZdotu(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  blasStatus = cublasZdotu(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  blasStatus = cublasZdotu_v2(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy, cuDoubleComplex* result);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasZdotc(hipblasHandle_t handle, int n, const hipblasDoubleComplex* x, int incx, const hipblasDoubleComplex* y, int incy, hipblasDoubleComplex* result);
+  // CHECK: blasStatus = hipblasZdotc(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  // CHECK-NEXT: blasStatus = hipblasZdotc(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  blasStatus = cublasZdotc(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+  blasStatus = cublasZdotc_v2(blasHandle, n, &dcomplex, incx, &dcomplex_2, incy, &dcomplex_res);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, float* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSscal(hipblasHandle_t handle, int n, const float* alpha, float* x, int incx);
+  // CHECK: blasStatus = hipblasSscal(blasHandle, n, &fy, &fx, incx);
+  // CHECK-NEXT: blasStatus = hipblasSscal(blasHandle, n, &fy, &fx, incx);
+  blasStatus = cublasSscal(blasHandle, n, &fy, &fx, incx);
+  blasStatus = cublasSscal_v2(blasHandle, n, &fy, &fx, incx);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, double* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDscal(hipblasHandle_t handle, int n, const double* alpha, double* x, int incx);
+  // CHECK: blasStatus = hipblasDscal(blasHandle, n, &dx, &dy, incx);
+  // CHECK-NEXT: blasStatus = hipblasDscal(blasHandle, n, &dx, &dy, incx);
+  blasStatus = cublasDscal(blasHandle, n, &dx, &dy, incx);
+  blasStatus = cublasDscal_v2(blasHandle, n, &dx, &dy, incx);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alpha, cuComplex* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCscal(hipblasHandle_t handle, int n, const hipblasComplex* alpha, hipblasComplex* x, int incx);
+  // CHECK: blasStatus = hipblasCscal(blasHandle, n, &complex, &complex_2, incx);
+  // CHECK-NEXT: blasStatus = hipblasCscal(blasHandle, n, &complex, &complex_2, incx);
+  blasStatus = cublasCscal(blasHandle, n, &complex, &complex_2, incx);
+  blasStatus = cublasCscal_v2(blasHandle, n, &complex, &complex_2, incx);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, cuComplex* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCsscal(hipblasHandle_t handle, int n, const float* alpha, hipblasComplex* x, int incx);
+  // CHECK: blasStatus = hipblasCsscal(blasHandle, n, &fx, &complex, incx);
+  // CHECK-NEXT: blasStatus = hipblasCsscal(blasHandle, n, &fx, &complex, incx);
+  blasStatus = cublasCsscal(blasHandle, n, &fx, &complex, incx);
+  blasStatus = cublasCsscal_v2(blasHandle, n, &fx, &complex, incx);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasZscal(hipblasHandle_t handle, int n, const hipblasDoubleComplex* alpha, hipblasDoubleComplex* x, int incx);
+  // CHECK: blasStatus = hipblasZscal(blasHandle, n, &dcomplex, &dcomplex_2, incx);
+  // CHECK-NEXT: blasStatus = hipblasZscal(blasHandle, n, &dcomplex, &dcomplex_2, incx);
+  blasStatus = cublasZscal(blasHandle, n, &dcomplex, &dcomplex_2, incx);
+  blasStatus = cublasZscal_v2(blasHandle, n, &dcomplex, &dcomplex_2, incx);
+
+  // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha, cuDoubleComplex* x, int incx);
+  // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasZdscal(hipblasHandle_t handle, int n, const double* alpha, hipblasDoubleComplex* x, int incx);
+  // CHECK: blasStatus = hipblasZdscal(blasHandle, n, &dx, &dcomplex, incx);
+  // CHECK-NEXT: blasStatus = hipblasZdscal(blasHandle, n, &dx, &dcomplex, incx);
+  blasStatus = cublasZdscal(blasHandle, n, &dx, &dcomplex, incx);
+  blasStatus = cublasZdscal_v2(blasHandle, n, &dx, &dcomplex, incx);
+
+  return 0;
+}
diff --git a/tests/unit_tests/synthetic/runtime_functions.cu b/tests/unit_tests/synthetic/runtime_functions.cu
index 29648536..4e1a3578 100644
--- a/tests/unit_tests/synthetic/runtime_functions.cu
+++ b/tests/unit_tests/synthetic/runtime_functions.cu
@@ -8,7 +8,10 @@
   #include "windows.h"
   #include <GL/glew.h>
 #endif
+
 #include "cuda_gl_interop.h"
+// CHECK: #include "hip/hip_runtime_api.h"
+#include "cuda_profiler_api.h"
 
 int main() {
   printf("12. CUDA Runtime API Functions synthetic test\n");
@@ -35,6 +38,8 @@ int main() {
   void* deviceptr_2 = nullptr;
   void* image = nullptr;
   void* func = nullptr;
+  void* src = nullptr;
+  void* dst = nullptr;
   char* ch = nullptr;
   const char* const_ch = nullptr;
   dim3 gridDim;
@@ -42,6 +47,7 @@ int main() {
   GLuint gl_uint = 0;
   GLenum gl_enum = 0;
   struct textureReference* texref = nullptr;
+  std::string name = "str";
 
 #if defined(_WIN32)
   unsigned long long ull = 0;
@@ -57,6 +63,17 @@ int main() {
   cudaError_t Error_t;
   cudaStream_t stream;
 
+  // CHECK: hipEvent_t Event_t;
+  // CHECK-Next: hipEvent_t Event_2;
+  cudaEvent_t Event_t;
+  cudaEvent_t Event_2;
+
+  // CHECK: hipMemcpy3DParms Memcpy3DParms;
+  cudaMemcpy3DParms Memcpy3DParms;
+
+  // CHECK: hipMemcpyKind MemcpyKind;
+  cudaMemcpyKind MemcpyKind;
+
 #if CUDA_VERSION >= 8000
   // CHECK: hipDeviceP2PAttr DeviceP2PAttr;
   cudaDeviceP2PAttr DeviceP2PAttr;
@@ -133,8 +150,8 @@ int main() {
   // CHECK: result = hipStreamBeginCapture(stream, StreamCaptureMode);
   result = cudaStreamBeginCapture(stream, StreamCaptureMode);
 
-  // CHECK: hipGraph_t Graph_t;
-  cudaGraph_t Graph_t;
+  // CHECK: hipGraph_t Graph_t, Graph_t_2;
+  cudaGraph_t Graph_t, Graph_t_2;
 
   // CUDA: extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
   // HIP: hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph);
@@ -209,6 +226,121 @@ int main() {
   // HIP: hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData);
   // CHECK: result = hipLaunchHostFunc(stream, hostFn, image);
   result = cudaLaunchHostFunc(stream, hostFn, image);
+
+  // CHECK: hipGraphNode_t graphNode, graphNode_2;
+  cudaGraphNode_t graphNode, graphNode_2;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph);
+  // HIP: hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, hipGraph_t childGraph);
+  // CHECK: result = hipGraphAddChildGraphNode(&graphNode, Graph_t, &graphNode_2, bytes, Graph_t_2);
+  result = cudaGraphAddChildGraphNode(&graphNode, Graph_t, &graphNode_2, bytes, Graph_t_2);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+  // HIP: hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from, const hipGraphNode_t* to, size_t numDependencies);
+  // CHECK: result = hipGraphAddDependencies(Graph_t, &graphNode, &graphNode_2, bytes);
+  result = cudaGraphAddDependencies(Graph_t, &graphNode, &graphNode_2, bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies);
+  // HIP: hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies);
+  // CHECK: result = hipGraphAddEmptyNode(&graphNode, Graph_t, &graphNode_2, bytes);
+  result = cudaGraphAddEmptyNode(&graphNode, Graph_t, &graphNode_2, bytes);
+
+  // CHECK: hipHostNodeParams HostNodeParams;
+  cudaHostNodeParams HostNodeParams;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, const hipHostNodeParams* pNodeParams);
+  // CHECK: result = hipGraphAddHostNode(&graphNode, Graph_t, &graphNode_2, bytes, &HostNodeParams);
+  result = cudaGraphAddHostNode(&graphNode, Graph_t, &graphNode_2, bytes, &HostNodeParams);
+
+  // CHECK: hipKernelNodeParams KernelNodeParams;
+  cudaKernelNodeParams KernelNodeParams;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, const hipKernelNodeParams* pNodeParams);
+  // CHECK: result = hipGraphAddKernelNode(&graphNode, Graph_t, &graphNode_2, bytes, &KernelNodeParams);
+  result = cudaGraphAddKernelNode(&graphNode, Graph_t, &graphNode_2, bytes, &KernelNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams);
+  // HIP: hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, const hipMemcpy3DParms* pCopyParams);
+  // CHECK: result = hipGraphAddMemcpyNode(&graphNode, Graph_t, &graphNode_2, bytes, &Memcpy3DParms);
+  result = cudaGraphAddMemcpyNode(&graphNode, Graph_t, &graphNode_2, bytes, &Memcpy3DParms);
+
+  // CHECK: hipMemsetParams MemsetParams;
+  cudaMemsetParams MemsetParams;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams);
+  // HIP: hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, const hipMemsetParams* pMemsetParams);
+  // CHECK: result = hipGraphAddMemsetNode(&graphNode, Graph_t, &graphNode_2, bytes, &MemsetParams);
+  result = cudaGraphAddMemsetNode(&graphNode, Graph_t, &graphNode_2, bytes, &MemsetParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph);
+  // HIP: hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph);
+  // CHECK: result = hipGraphChildGraphNodeGetGraph(graphNode, &Graph_t);
+  result = cudaGraphChildGraphNodeGetGraph(graphNode, &Graph_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph);
+  // HIP: hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph);
+  // CHECK: result = hipGraphClone(&Graph_t, Graph_t_2);
+  result = cudaGraphClone(&Graph_t, Graph_t_2);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags);
+  // HIP: hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags);
+  // CHECK: result = hipGraphCreate(&Graph_t, flags);
+  result = cudaGraphCreate(&Graph_t, flags);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph);
+  // HIP: hipError_t hipGraphDestroy(hipGraph_t graph);
+  // CHECK: result = hipGraphDestroy(Graph_t);
+  result = cudaGraphDestroy(Graph_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node);
+  // HIP: hipError_t hipGraphDestroyNode(hipGraphNode_t node);
+  // CHECK: result = hipGraphDestroyNode(graphNode);
+  result = cudaGraphDestroyNode(graphNode);
+
+  // CHECK: hipGraphExec_t GraphExec_t;
+  cudaGraphExec_t GraphExec_t;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec);
+  // HIP: hipError_t hipGraphExecDestroy(hipGraphExec_t graphExec);
+  // CHECK: result = hipGraphExecDestroy(GraphExec_t);
+  result = cudaGraphExecDestroy(GraphExec_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges);
+  // HIP: hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from, hipGraphNode_t* to, size_t* numEdges);
+  // CHECK: result = hipGraphGetEdges(Graph_t, &graphNode, &graphNode_2, &bytes);
+  result = cudaGraphGetEdges(Graph_t, &graphNode, &graphNode_2, &bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes);
+  // HIP: hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes, size_t* numNodes);
+  // CHECK: result = hipGraphGetNodes(Graph_t, &graphNode, &bytes);
+  result = cudaGraphGetNodes(Graph_t, &graphNode, &bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes);
+  // HIP: hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes, size_t* pNumRootNodes);
+  // CHECK: result = hipGraphGetRootNodes(Graph_t, &graphNode, &bytes);
+  result = cudaGraphGetRootNodes(Graph_t, &graphNode, &bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node, hipHostNodeParams* pNodeParams);
+  // CHECK: result = hipGraphHostNodeGetParams(graphNode, &HostNodeParams);
+  result = cudaGraphHostNodeGetParams(graphNode, &HostNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node, const hipHostNodeParams* pNodeParams);
+  // CHECK: result = hipGraphHostNodeSetParams(graphNode, &HostNodeParams);
+  result = cudaGraphHostNodeSetParams(graphNode, &HostNodeParams);
+
+  char* name_ = const_cast<char*>(name.c_str());
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, char *pLogBuffer, size_t bufferSize);
+  // HIP: hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, hipGraphNode_t* pErrorNode, char* pLogBuffer, size_t bufferSize);
+  // CHECK: result = hipGraphInstantiate(&GraphExec_t, Graph_t, &graphNode, name_, bytes);
+  result = cudaGraphInstantiate(&GraphExec_t, Graph_t, &graphNode, name_, bytes);
+
+  // CHECK: hipGraphNodeType GraphNodeType;
+  cudaGraphNodeType GraphNodeType;
 #endif
 
 #if CUDA_VERSION >= 10010
@@ -231,13 +363,16 @@ int main() {
   result = cudaThreadExchangeStreamCaptureMode(&streamCaptureMode);
 #endif
 
+#if CUDA_VERSION >= 10020
+  // CHECK: hipGraphExecUpdateResult GraphExecUpdateResult;
+  cudaGraphExecUpdateResult GraphExecUpdateResult;
+#endif
+
 #if CUDA_VERSION >= 11000
   // CHECK: hipKernelNodeAttrID kernelNodeAttrID;
   cudaKernelNodeAttrID kernelNodeAttrID;
   // CHECK: hipKernelNodeAttrValue kernelNodeAttrValue;
   cudaKernelNodeAttrValue kernelNodeAttrValue;
-  // CHECK: hipGraphNode_t graphNode;
-  cudaGraphNode_t graphNode;
 
   // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr, const union cudaKernelNodeAttrValue* value);
   // HIP: hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr, const hipKernelNodeAttrValue* value);
@@ -248,6 +383,183 @@ int main() {
   // HIP: hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr, hipKernelNodeAttrValue* value);
   // CHECK: result = hipGraphKernelNodeGetAttribute(graphNode, kernelNodeAttrID, &kernelNodeAttrValue);
   result = cudaGraphKernelNodeGetAttribute(graphNode, kernelNodeAttrID, &kernelNodeAttrValue);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipKernelNodeParams* pNodeParams);
+  // CHECK: result = hipGraphExecKernelNodeSetParams(GraphExec_t, graphNode, &KernelNodeParams);
+  result = cudaGraphExecKernelNodeSetParams(GraphExec_t, graphNode, &KernelNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+  // HIP: hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipMemcpy3DParms* pNodeParams);
+  // CHECK: result = hipGraphExecMemcpyNodeSetParams(GraphExec_t, graphNode, &Memcpy3DParms);
+  result = cudaGraphExecMemcpyNodeSetParams(GraphExec_t, graphNode, &Memcpy3DParms);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+  // HIP: hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipMemsetParams* pNodeParams);
+  // CHECK: result = hipGraphExecMemsetNodeSetParams(GraphExec_t, graphNode, &MemsetParams);
+  result = cudaGraphExecMemsetNodeSetParams(GraphExec_t, graphNode, &MemsetParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+  // HIP: hipError_t hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, const hipHostNodeParams* pNodeParams);
+  // CHECK: result = hipGraphExecHostNodeSetParams(GraphExec_t, graphNode, &HostNodeParams);
+  result = cudaGraphExecHostNodeSetParams(GraphExec_t, graphNode, &HostNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out);
+  // HIP: hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph, hipGraphNode_t* hErrorNode_out, hipGraphExecUpdateResult* updateResult_out);
+  // CHECK: result = hipGraphExecUpdate(GraphExec_t, Graph_t, &graphNode, &GraphExecUpdateResult);
+  result = cudaGraphExecUpdate(GraphExec_t, Graph_t, &graphNode, &GraphExecUpdateResult);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node, hipKernelNodeParams* pNodeParams);
+  // CHECK: result = hipGraphKernelNodeGetParams(graphNode, &KernelNodeParams);
+  result = cudaGraphKernelNodeGetParams(graphNode, &KernelNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+  // HIP: hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node, const hipKernelNodeParams* pNodeParams);
+  // CHECK: result = hipGraphKernelNodeSetParams(graphNode, &KernelNodeParams);
+  result = cudaGraphKernelNodeSetParams(graphNode, &KernelNodeParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+  // HIP: hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream);
+  // CHECK: result = hipGraphLaunch(GraphExec_t, stream);
+  result = cudaGraphLaunch(GraphExec_t, stream);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams);
+  // HIP: hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node, hipMemcpy3DParms* pNodeParams);
+  // CHECK: result = hipGraphMemcpyNodeGetParams(graphNode, &Memcpy3DParms);
+  result = cudaGraphMemcpyNodeGetParams(graphNode, &Memcpy3DParms);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+  // HIP: hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node, const hipMemcpy3DParms* pNodeParams);
+  // CHECK: result = hipGraphMemcpyNodeSetParams(graphNode, &Memcpy3DParms);
+  result = cudaGraphMemcpyNodeSetParams(graphNode, &Memcpy3DParms);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams);
+  // HIP: hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node, hipMemsetParams* pNodeParams);
+  // CHECK: result = hipGraphMemsetNodeGetParams(graphNode, &MemsetParams);
+  result = cudaGraphMemsetNodeGetParams(graphNode, &MemsetParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+  // HIP: hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node, const hipMemsetParams* pNodeParams);
+  // CHECK: result = hipGraphMemsetNodeSetParams(graphNode, &MemsetParams);
+  result = cudaGraphMemsetNodeSetParams(graphNode, &MemsetParams);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph);
+  // HIP: hipError_t hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode, hipGraph_t clonedGraph);
+  // CHECK: result = hipGraphNodeFindInClone(&graphNode, graphNode_2, Graph_t);
+  result = cudaGraphNodeFindInClone(&graphNode, graphNode_2, Graph_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies);
+  // HIP: hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node, hipGraphNode_t* pDependencies, size_t* pNumDependencies);
+  // CHECK: result = hipGraphNodeGetDependencies(graphNode, &graphNode_2, &bytes);
+  result = cudaGraphNodeGetDependencies(graphNode, &graphNode_2, &bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes);
+  // HIP: hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node, hipGraphNode_t* pDependentNodes, size_t* pNumDependentNodes);
+  // CHECK: result = hipGraphNodeGetDependentNodes(graphNode, &graphNode_2, &bytes);
+  result = cudaGraphNodeGetDependentNodes(graphNode, &graphNode_2, &bytes);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType);
+  // HIP: hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType);
+  // CHECK: result = hipGraphNodeGetType(graphNode, &GraphNodeType);
+  result = cudaGraphNodeGetType(graphNode, &GraphNodeType);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+  // HIP: hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from, const hipGraphNode_t* to, size_t numDependencies);
+  // CHECK: result = hipGraphRemoveDependencies(Graph_t, &graphNode, &graphNode, bytes);
+  result = cudaGraphRemoveDependencies(Graph_t, &graphNode, &graphNode, bytes);
+#endif
+
+#if CUDA_VERSION >= 11010
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, const void* symbol, const void* src, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphAddMemcpyNodeToSymbol(&graphNode, Graph_t, &graphNode_2, width, HIP_SYMBOL(image), src, bytes, wOffset, MemcpyKind);
+  result = cudaGraphAddMemcpyNodeToSymbol(&graphNode, Graph_t, &graphNode_2, width, image, src, bytes, wOffset, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, void* dst, const void* symbol, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphAddMemcpyNodeFromSymbol(&graphNode, Graph_t, &graphNode_2, width, dst, HIP_SYMBOL(image), bytes, wOffset, MemcpyKind);
+  result = cudaGraphAddMemcpyNodeFromSymbol(&graphNode, Graph_t, &graphNode_2, width, dst, image, bytes, wOffset, MemcpyKind);
+
+  // CUDA:  extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(cudaGraphNode_t* pGraphNode, cudaGraph_t graph, const cudaGraphNode_t* pDependencies, size_t numDependencies, void* dst, const void* src, size_t count, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, void* dst, const void* src, size_t count, hipMemcpyKind kind);
+  // CHECK: result = hipGraphAddMemcpyNode1D(&graphNode, Graph_t, &graphNode_2, width, dst, src, bytes, MemcpyKind);
+  result = cudaGraphAddMemcpyNode1D(&graphNode, Graph_t, &graphNode_2, width, dst, src, bytes, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphMemcpyNodeSetParamsToSymbol(graphNode, HIP_SYMBOL(image), src, bytes, wOffset, MemcpyKind);
+  result = cudaGraphMemcpyNodeSetParamsToSymbol(graphNode, image, src, bytes, wOffset, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(cudaGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphMemcpyNodeSetParamsFromSymbol(graphNode, dst, HIP_SYMBOL(image), bytes, wOffset, MemcpyKind);
+  result = cudaGraphMemcpyNodeSetParamsFromSymbol(graphNode, dst, image, bytes, wOffset, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams1D(cudaGraphNode_t node, void* dst, const void* src, size_t count, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst, const void* src, size_t count, hipMemcpyKind kind);
+  // CHECK: result = hipGraphMemcpyNodeSetParams1D(graphNode, dst, src, bytes, MemcpyKind);
+  result = cudaGraphMemcpyNodeSetParams1D(graphNode, dst, src, bytes, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+  // HIP: hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, hipEvent_t event);
+  // CHECK: result = hipGraphAddEventRecordNode(&graphNode, Graph_t, &graphNode_2, bytes, Event_t);
+  result = cudaGraphAddEventRecordNode(&graphNode, Graph_t, &graphNode_2, bytes, Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+  // HIP: hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
+  // CHECK: result = hipGraphEventRecordNodeGetEvent(graphNode, &Event_t);
+  result = cudaGraphEventRecordNodeGetEvent(graphNode, &Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+  // HIP: hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
+  // CHECK: result = hipGraphEventRecordNodeSetEvent(graphNode, Event_t);
+  result = cudaGraphEventRecordNodeSetEvent(graphNode, Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+  // HIP: hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies, size_t numDependencies, hipEvent_t event);
+  // CHECK: result = hipGraphAddEventWaitNode(&graphNode, Graph_t, &graphNode_2, bytes, Event_t);
+  result = cudaGraphAddEventWaitNode(&graphNode, Graph_t, &graphNode_2, bytes, Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+  // HIP: hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out);
+  // CHECK: result = hipGraphEventWaitNodeGetEvent(graphNode, &Event_t);
+  result = cudaGraphEventWaitNodeGetEvent(graphNode, &Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+  // HIP: hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event);
+  // CHECK: result = hipGraphEventWaitNodeSetEvent(graphNode, Event_t);
+  result = cudaGraphEventWaitNodeSetEvent(graphNode, Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, const void* symbol, const void* src, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphExecMemcpyNodeSetParamsToSymbol(GraphExec_t, graphNode, HIP_SYMBOL(image), src, bytes, wOffset, MemcpyKind);
+  result = cudaGraphExecMemcpyNodeSetParamsToSymbol(GraphExec_t, graphNode, image, src, bytes, wOffset, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsFromSymbol(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* symbol, size_t count, size_t offset, hipMemcpyKind kind);
+  // CHECK: result = hipGraphExecMemcpyNodeSetParamsFromSymbol(GraphExec_t, graphNode, dst, HIP_SYMBOL(image), bytes, wOffset, MemcpyKind);
+  result = cudaGraphExecMemcpyNodeSetParamsFromSymbol(GraphExec_t, graphNode, dst, image, bytes, wOffset, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, void* dst, const void* src, size_t count, enum cudaMemcpyKind kind);
+  // HIP: hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec, hipGraphNode_t node, void* dst, const void* src, size_t count, hipMemcpyKind kind);
+  // CHECK: result = hipGraphExecMemcpyNodeSetParams1D(GraphExec_t, graphNode, dst, src, bytes, MemcpyKind);
+  result = cudaGraphExecMemcpyNodeSetParams1D(GraphExec_t, graphNode, dst, src, bytes, MemcpyKind);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+  // HIP: hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNode_t node, hipGraph_t childGraph);
+  // CHECK: result = hipGraphExecChildGraphNodeSetParams(GraphExec_t, graphNode, Graph_t);
+  result = cudaGraphExecChildGraphNodeSetParams(GraphExec_t, graphNode, Graph_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+  // HIP: hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event);
+  // CHECK: result = hipGraphExecEventRecordNodeSetEvent(GraphExec_t, graphNode, Event_t);
+  result = cudaGraphExecEventRecordNodeSetEvent(GraphExec_t, graphNode, Event_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+  // HIP: hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, hipEvent_t event);
+  // CHECK: result = hipGraphExecEventWaitNodeSetEvent(GraphExec_t, graphNode, Event_t);
+  result = cudaGraphExecEventWaitNodeSetEvent(GraphExec_t, graphNode, Event_t);
 #endif
 
 #if CUDA_VERSION >= 11020
@@ -355,6 +667,13 @@ int main() {
   result = cudaMemPoolImportPointer(&deviceptr, memPool_t, &memPoolPtrExportData);
 #endif
 
+#if CUDA_VERSION >= 11040
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags);
+  // HIP: hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph, unsigned long long flags);
+  // CHECK: result = hipGraphInstantiateWithFlags(&GraphExec_t, Graph_t, ull);
+  result = cudaGraphInstantiateWithFlags(&GraphExec_t, Graph_t, ull);
+#endif
+
   // CHECK: hipDeviceProp_t DeviceProp;
   cudaDeviceProp DeviceProp;
 
@@ -458,11 +777,6 @@ int main() {
   // CHECK: hipIpcEventHandle_t IpcEventHandle_t;
   cudaIpcEventHandle_t IpcEventHandle_t;
 
-  // CHECK: hipEvent_t Event_t;
-  // CHECK-Next: hipEvent_t Event_2;
-  cudaEvent_t Event_t;
-  cudaEvent_t Event_2;
-
   // CUDA: extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
   // HIP: hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event);
   // CHECK: result = hipIpcGetEventHandle(&IpcEventHandle_t, Event_t);
@@ -795,9 +1109,6 @@ int main() {
   // CHECK: result = hipMallocPitch(&deviceptr, &bytes, width, height);
   result = cudaMallocPitch(&deviceptr, &bytes, width, height);
 
-  // CHECK: hipMemcpyKind MemcpyKind;
-  cudaMemcpyKind MemcpyKind;
-
   // CUDA: extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
   // HIP: hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);
   // CHECK: result = hipMemcpy(deviceptr, deviceptr_2, bytes, MemcpyKind);
@@ -833,9 +1144,6 @@ int main() {
   // CHECK: result = hipMemcpy2DToArrayAsync(Array_t, wOffset, hOffset, deviceptr_2, pitch, width, height, MemcpyKind, stream);
   result = cudaMemcpy2DToArrayAsync(Array_t, wOffset, hOffset, deviceptr_2, pitch, width, height, MemcpyKind, stream);
 
-  // CHECK: hipMemcpy3DParms Memcpy3DParms;
-  cudaMemcpy3DParms Memcpy3DParms;
-
   // CUDA: extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
   // HIP: hipError_t hipMemcpy3D(const struct hipMemcpy3DParms* p);
   // CHECK: result = hipMemcpy3D(&Memcpy3DParms);
@@ -1058,5 +1366,92 @@ int main() {
   // CHECK: result = hipUnbindTexture(texref);
   result = cudaUnbindTexture(texref);
 
+  // CHECK: hipTextureObject_t TextureObject_t;
+  cudaTextureObject_t TextureObject_t;
+
+  // CHECK: hipResourceDesc ResourceDesc;
+  cudaResourceDesc ResourceDesc;
+
+  // CHECK: hipTextureDesc TextureDesc;
+  cudaTextureDesc TextureDesc;
+
+  // CHECK: hipResourceViewDesc ResourceViewDesc;
+  cudaResourceViewDesc ResourceViewDesc;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
+  // HIP:  hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc, const hipTextureDesc* pTexDesc, const struct hipResourceViewDesc* pResViewDesc);
+  // CHECK: result = hipCreateTextureObject(&TextureObject_t, &ResourceDesc, &TextureDesc, &ResourceViewDesc);
+  result = cudaCreateTextureObject(&TextureObject_t, &ResourceDesc, &TextureDesc, &ResourceViewDesc);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
+  // HIP:  hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject);
+  // CHECK: result = hipDestroyTextureObject(TextureObject_t);
+  result = cudaDestroyTextureObject(TextureObject_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+  // HIP:  hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, hipTextureObject_t textureObject);
+  // CHECK: result = hipGetTextureObjectResourceDesc(&ResourceDesc, TextureObject_t);
+  result = cudaGetTextureObjectResourceDesc(&ResourceDesc, TextureObject_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+  // HIP:  hipError_t hipGetTextureObjectResourceViewDesc(struct hipResourceViewDesc* pResViewDesc, hipTextureObject_t textureObject);
+  // CHECK: result = hipGetTextureObjectResourceViewDesc(&ResourceViewDesc, TextureObject_t);
+  result = cudaGetTextureObjectResourceViewDesc(&ResourceViewDesc, TextureObject_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+  // HIP:  hipError_t hipGetTextureObjectTextureDesc(hipTextureDesc* pTexDesc, hipTextureObject_t textureObject);
+  // CHECK: result = hipGetTextureObjectTextureDesc(&TextureDesc, TextureObject_t);
+  result = cudaGetTextureObjectTextureDesc(&TextureDesc, TextureObject_t);
+
+  // CHECK: hipSurfaceObject_t SurfaceObject_t;
+  cudaSurfaceObject_t SurfaceObject_t;
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+  // HIP:  hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc);
+  // CHECK: result = hipCreateSurfaceObject(&SurfaceObject_t, &ResourceDesc);
+  result = cudaCreateSurfaceObject(&SurfaceObject_t, &ResourceDesc);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+  // HIP:  hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
+  // CHECK: result = hipDestroySurfaceObject(SurfaceObject_t);
+  result = cudaDestroySurfaceObject(SurfaceObject_t);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
+  // HIP:  hipError_t hipDriverGetVersion(int* driverVersion);
+  // CHECK: result = hipDriverGetVersion(&intVal);
+  result = cudaDriverGetVersion(&intVal);
+
+  // CUDA: extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+  // HIP:  hipError_t hipRuntimeGetVersion(int* runtimeVersion);
+  // CHECK: result = hipRuntimeGetVersion(&intVal);
+  result = cudaRuntimeGetVersion(&intVal);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaProfilerStart(void);
+  // HIP:  hipError_t hipProfilerStart();
+  // CHECK: result = hipProfilerStart();
+  result = cudaProfilerStart();
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaProfilerStop(void);
+  // HIP:  hipError_t hipProfilerStop();
+  // CHECK: result = hipProfilerStop();
+  result = cudaProfilerStop();
+
+#if CUDA_VERSION <= 10000
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
+  // HIP:  hipError_t hipConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dparm(0), hipStream_t stream __dparm(0));
+  // CHECK: result = hipConfigureCall(gridDim, blockDim, bytes, stream);
+  result = cudaConfigureCall(gridDim, blockDim, bytes, stream);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func);
+  // HIP:  hipError_t hipLaunchByPtr(const void* func);
+  // CHECK: result = hipLaunchByPtr(deviceptr);
+  result = cudaLaunch(deviceptr);
+
+  // CUDA: extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset);
+  // HIP:  hipError_t hipSetupArgument(const void* arg, size_t size, size_t offset);
+  // CHECK: result = hipSetupArgument(deviceptr, bytes, wOffset);
+  result = cudaSetupArgument(deviceptr, bytes, wOffset);
+#endif
+
   return 0;
 }

11.7.0	11.7.1	works only with the patch due to the clang's bug 54609 patch for 14.0.0 patch for 14.0.1 @@ -171,7 +171,7 @@ After applying all the matchers, the output HIP source is produced.
14.0.5, 14.0.6	11.7.0	11.7.1	LATEST STABLE CONFIG