Implement parallel cuda::std::remove{_if} #7693
Open
miscco wants to merge 2 commits into NVIDIA:main from miscco:parallel_remove
+736 −17
@@ -0,0 +1,43 @@

```cpp
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/complex>
#include <cuda/stream>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});
  const auto count = cuda::std::count(cuda::execution::__cub_par_unseq, in.begin(), in.end(), T{42});

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements - count);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::remove(cuda_policy(alloc, launch), in.begin(), in.end(), T{42});
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
```
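For reference, a minimal standalone use of the overload this benchmark exercises might look like the sketch below. It assumes the headers added in this PR are available; the policy object `cuda::execution::__cub_par_unseq` is the internal spelling the benchmark itself uses for `cuda::std::count` and may not be the final public one.

```cpp
// Sketch only: parallel remove of a value, under the assumptions above.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

int main()
{
  thrust::device_vector<int> v{1, 42, 2, 42, 3};

  // remove() compacts the surviving elements to the front and returns the
  // new logical end; the container size is unchanged until the tail is erased.
  auto new_end = cuda::std::remove(cuda::execution::__cub_par_unseq, v.begin(), v.end(), 42);
  v.erase(new_end, v.end()); // v now holds {1, 2, 3}
  return 0;
}
```

As with the serial algorithm, the usual remove-erase idiom applies: the algorithm never changes the container's size itself.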
@@ -0,0 +1,56 @@

```cpp
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/complex>
#include <cuda/stream>

#include "nvbench_helper.cuh"

struct is_even
{
  template <class T>
  __device__ constexpr bool operator()(const T& val) const noexcept
  {
    return static_cast<int>(val) % 2 == 0;
  }

  __device__ constexpr bool operator()(const complex& val) const noexcept
  {
    return static_cast<int>(val.real()) % 2 == 0;
  }
};

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> in = generate(elements, bit_entropy::_1_000, T{0}, T{42});

  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements);
  state.add_global_memory_writes<T>(elements / 2);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::remove_if(cuda_policy(alloc, launch), in.begin(), in.end(), is_even{});
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
```
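The `remove_if` benchmark follows the same shape with a device-callable predicate. A corresponding standalone sketch, under the same assumptions as above (internal policy spelling, predicate marked `__device__`):

```cpp
// Sketch only: parallel remove_if with a user-provided predicate.
#include <thrust/device_vector.h>

#include <cuda/std/__pstl_algorithm>

struct is_negative
{
  __device__ bool operator()(int x) const
  {
    return x < 0; // elements for which this returns true are removed
  }
};

int main()
{
  thrust::device_vector<int> v{1, -2, 3, -4, 5};
  auto new_end = cuda::std::remove_if(cuda::execution::__cub_par_unseq, v.begin(), v.end(), is_negative{});
  v.erase(new_end, v.end()); // v now holds {1, 3, 5}
  return 0;
}
```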
@@ -0,0 +1,164 @@

```cpp
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_REMOVE_IF_H
#define _CUDA_STD___PSTL_CUDA_REMOVE_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

#  include <cub/device/device_select.cuh>

_CCCL_DIAG_POP

#  include <cuda/__execution/policy.h>
#  include <cuda/__functional/call_or.h>
#  include <cuda/__memory_pool/device_memory_pool.h>
#  include <cuda/__memory_resource/get_memory_resource.h>
#  include <cuda/__runtime/api_wrapper.h>
#  include <cuda/__stream/get_stream.h>
#  include <cuda/__stream/stream_ref.h>
#  include <cuda/std/__algorithm/remove_if.h>
#  include <cuda/std/__exception/cuda_error.h>
#  include <cuda/std/__exception/exception_macros.h>
#  include <cuda/std/__execution/env.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__iterator/incrementable_traits.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__iterator/next.h>
#  include <cuda/std/__pstl/cuda/temporary_storage.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__utility/move.h>

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

template <>
struct __pstl_dispatch<__pstl_algorithm::__remove_if, __execution_backend::__cuda>
{
  template <class _Policy, class _InputIterator, class _UnaryPredicate>
  [[nodiscard]] _CCCL_HOST_API static _InputIterator __par_impl(
    const _Policy& __policy, _InputIterator __first, iter_difference_t<_InputIterator> __count, _UnaryPredicate __pred)
  {
    using _OffsetType = iter_difference_t<_InputIterator>;
    _OffsetType __ret;

    auto __stream   = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
    auto __resource = ::cuda::__call_or(
      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);

    // Determine temporary device storage requirements
    void* __temp_storage = nullptr;
    size_t __num_bytes   = 0;
    _CCCL_TRY_CUDA_API(
      ::cub::DeviceSelect::If,
      "__pstl_cuda_select_if: determination of device storage for cub::DeviceSelect::If failed",
      __temp_storage,
      __num_bytes,
      __first,
      static_cast<_OffsetType*>(nullptr),
      __count,
      __pred,
      __stream.get());

    {
      __temporary_storage<_OffsetType, decltype(__resource)> __storage{__stream, __resource, __num_bytes};

      // Run the kernel
      _CCCL_TRY_CUDA_API(
        ::cub::DeviceSelect::If,
        "__pstl_cuda_select_if: kernel launch of cub::DeviceSelect::If failed",
        __storage.__get_temp_storage(),
        __num_bytes,
        ::cuda::std::move(__first),
        __storage.__get_result_iter(),
        __count,
        ::cuda::std::move(__pred),
        __stream.get());

      // Copy the result back from storage
      _CCCL_TRY_CUDA_API(
        ::cudaMemcpyAsync,
        "__pstl_cuda_select_if: copy of result from device to host failed",
        ::cuda::std::addressof(__ret),
        __storage.__res_,
        sizeof(_OffsetType),
        ::cudaMemcpyDefault,
        __stream.get());
    }

    __stream.sync();
    return __first + __ret;
  }

  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPredicate)
  _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>)
  [[nodiscard]] _CCCL_HOST_API _InputIterator operator()(
    [[maybe_unused]] const _Policy& __policy,
    _InputIterator __first,
    iter_difference_t<_InputIterator> __count,
    _UnaryPredicate __pred) const
  {
    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>)
    {
      try
      {
        return __par_impl(__policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__pred));
      }
      catch (const ::cuda::cuda_error& __err)
      {
        if (__err.status() == ::cudaErrorMemoryAllocation)
        {
          _CCCL_THROW(::std::bad_alloc);
        }
        else
        {
          throw __err;
        }
      }
    }
    else
    {
      static_assert(__always_false_v<_Policy>,
                    "__pstl_cuda_remove_if: CUDA backend of cuda::std::remove_if requires at least random access "
                    "iterators");
      auto __last = ::cuda::std::next(__first, __count);
      return ::cuda::std::remove_if(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
    }
  }
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

#  include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_REMOVE_IF_H
```
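The backend above is a thin wrapper around `cub::DeviceSelect::If`'s standard two-phase protocol: a first call with a null temporary-storage pointer only queries the scratch size, and the real call runs once the scratch is allocated. The PR uses CUB's in-place overload (the input sequence doubles as the output) together with the pooled `__temporary_storage` helper; the sketch below shows the same protocol in its classic separate-output form with plain `cudaMalloc`, and omits error handling for brevity.

```cpp
// Sketch of the two-phase cub::DeviceSelect::If protocol the backend wraps.
#include <cub/device/device_select.cuh>

#include <thrust/device_vector.h>

struct keep_not_42
{
  __device__ bool operator()(int x) const
  {
    return x != 42; // DeviceSelect::If KEEPS elements where this is true
  }
};

int main()
{
  thrust::device_vector<int> in{1, 42, 2, 42, 3};
  thrust::device_vector<int> out(in.size());
  thrust::device_vector<int> num_selected(1);

  int* d_in  = thrust::raw_pointer_cast(in.data());
  int* d_out = thrust::raw_pointer_cast(out.data());
  int* d_num = thrust::raw_pointer_cast(num_selected.data());
  int n      = static_cast<int>(in.size());

  // Phase 1: null temp-storage pointer => only compute the required bytes.
  void* d_temp      = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num, n, keep_not_42{});

  // Phase 2: allocate the scratch space and perform the selection.
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceSelect::If(d_temp, temp_bytes, d_in, d_out, d_num, n, keep_not_42{});
  cudaDeviceSynchronize();
  cudaFree(d_temp);
  // out now begins with {1, 2, 3} and num_selected[0] == 3.
  return 0;
}
```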
@@ -0,0 +1,99 @@

```cpp
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REMOVE_H
#define _CUDA_STD___PSTL_REMOVE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

#  include <cuda/__nvtx/nvtx.h>
#  include <cuda/std/__algorithm/remove.h>
#  include <cuda/std/__concepts/concept_macros.h>
#  include <cuda/std/__execution/policy.h>
#  include <cuda/std/__iterator/concepts.h>
#  include <cuda/std/__iterator/distance.h>
#  include <cuda/std/__iterator/iterator_traits.h>
#  include <cuda/std/__pstl/dispatch.h>
#  include <cuda/std/__type_traits/always_false.h>
#  include <cuda/std/__type_traits/integral_constant.h>
#  include <cuda/std/__type_traits/is_execution_policy.h>
#  include <cuda/std/__utility/move.h>

#  if _CCCL_HAS_BACKEND_CUDA()
#    include <cuda/std/__pstl/cuda/remove_if.h>
#  endif // _CCCL_HAS_BACKEND_CUDA()

#  include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

template <class _Tp>
struct __remove_compare_not_eq
{
  _Tp __val_;

  _CCCL_API constexpr __remove_compare_not_eq(const _Tp& __val) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __val_(__val)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr bool operator()(const _Up& __rhs) const
    noexcept(__is_cpp17_nothrow_equality_comparable_v<_Tp, _Up>)
  {
    return !(__val_ == __rhs);
  }
};

_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API _InputIterator
remove([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, const _Tp& __value)
{
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__remove_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::remove");

    if (__first == __last)
    {
      return __first;
    }
    const auto __count = ::cuda::std::distance(__first, __last);
    return __dispatch(__policy, __first, __count, __remove_compare_not_eq{__value});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::remove requires at least one selected backend");
    return ::cuda::std::remove(::cuda::std::move(__first), ::cuda::std::move(__last), __value);
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

#  include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REMOVE_H
```
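One convention worth noting: because `cub::DeviceSelect::If` keeps elements for which the predicate is true, the dispatch layer hands the backend a keep-predicate. `__remove_compare_not_eq` therefore returns true for elements that should survive, which is the opposite sense from the predicate `std::remove_if` takes. A small host-only illustration of that inversion (the names here are illustrative, not from the PR):

```cpp
// Host-side illustration: remove(value) expressed via a keep-predicate,
// mirroring how __remove_compare_not_eq is used above.
#include <algorithm>
#include <cassert>
#include <vector>

template <class T>
struct keep_not_equal
{
  T value;
  bool operator()(const T& x) const
  {
    return !(value == x); // true => element survives
  }
};

int main()
{
  std::vector<int> v{1, 42, 2, 42, 3};
  keep_not_equal<int> keep{42};

  // std::remove_if removes where its predicate is true, so the
  // keep-predicate must be negated to express remove(v, 42) through it.
  auto new_end = std::remove_if(v.begin(), v.end(), [&](int x) { return !keep(x); });
  v.erase(new_end, v.end());
  assert((v == std::vector<int>{1, 2, 3}));
  return 0;
}
```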