Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions libcudacxx/benchmarks/bench/remove/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/complex>
#include <cuda/stream>

#include "nvbench_helper.cuh"

//! Benchmark cuda::std::remove removing all occurrences of T{42} from a device vector.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  // Generate the input and count how many elements compare equal to 42 (the removed value)
  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});
  const auto num_matches = cuda::std::count(cuda::execution::__cub_par_unseq, input.begin(), input.end(), T{42});

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  // Only the surviving elements are written back
  state.add_global_memory_writes<T>(num_elements - num_matches);

  caching_allocator_t alloc{};

  // NOTE(review): the timed call compacts `input` in place, so later iterations may encounter
  // fewer matches than the first — confirm this is the intended measurement
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::remove(cuda_policy(alloc, launch), input.begin(), input.end(), T{42});
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
56 changes: 56 additions & 0 deletions libcudacxx/benchmarks/bench/remove_if/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/std/complex>
#include <cuda/stream>

#include "nvbench_helper.cuh"

//! Predicate returning true when the truncated integer value of the input is even.
struct is_even
{
  // Generic overload: truncate to int, then test evenness via modulo
  template <class T>
  __device__ constexpr bool operator()(const T& input) const noexcept
  {
    const int truncated = static_cast<int>(input);
    return truncated % 2 == 0;
  }

  // complex overload: inspect the real part only
  __device__ constexpr bool operator()(const complex& input) const noexcept
  {
    const int truncated = static_cast<int>(input.real());
    return truncated % 2 == 0;
  }
};

//! Benchmark cuda::std::remove_if with the is_even predicate over a device vector.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input = generate(num_elements, bit_entropy::_1_000, T{0}, T{42});

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  // Estimate: roughly half of the elements survive the is_even predicate
  state.add_global_memory_writes<T>(num_elements / 2);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::remove_if(cuda_policy(alloc, launch), input.begin(), input.end(), is_even{});
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
164 changes: 164 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/cuda/remove_if.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_REMOVE_IF_H
#define _CUDA_STD___PSTL_CUDA_REMOVE_IF_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

# include <cub/device/device_select.cuh>

_CCCL_DIAG_POP

# include <cuda/__execution/policy.h>
# include <cuda/__functional/call_or.h>
# include <cuda/__memory_pool/device_memory_pool.h>
# include <cuda/__memory_resource/get_memory_resource.h>
# include <cuda/__runtime/api_wrapper.h>
# include <cuda/__stream/get_stream.h>
# include <cuda/__stream/stream_ref.h>
# include <cuda/std/__algorithm/remove_if.h>
# include <cuda/std/__exception/cuda_error.h>
# include <cuda/std/__exception/exception_macros.h>
# include <cuda/std/__execution/env.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__iterator/incrementable_traits.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__iterator/next.h>
# include <cuda/std/__pstl/cuda/temporary_storage.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__utility/move.h>

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

template <>
struct __pstl_dispatch<__pstl_algorithm::__remove_if, __execution_backend::__cuda>
{
  //! Parallel implementation of remove_if on the CUDA backend.
  //!
  //! Compacts the range [__first, __first + __count) in place via cub::DeviceSelect::If, which keeps
  //! the elements for which \p __pred returns true (callers pass the negated, "keep" predicate).
  //! Returns an iterator to the new logical end of the range.
  template <class _Policy, class _InputIterator, class _UnaryPredicate>
  [[nodiscard]] _CCCL_HOST_API static _InputIterator __par_impl(
    const _Policy& __policy, _InputIterator __first, iter_difference_t<_InputIterator> __count, _UnaryPredicate __pred)
  {
    using _OffsetType = iter_difference_t<_InputIterator>;
    _OffsetType __ret{};

    // Take the stream and memory resource from the policy if provided, otherwise fall back to defaults
    auto __stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
    auto __resource = ::cuda::__call_or(
      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);

    // Determine temporary device storage requirements
    void* __temp_storage = nullptr;
    size_t __num_bytes   = 0;
    _CCCL_TRY_CUDA_API(
      ::cub::DeviceSelect::If,
      "__pstl_cuda_select_if: determination of device storage for cub::DeviceSelect::If failed",
      __temp_storage,
      __num_bytes,
      __first,
      static_cast<_OffsetType*>(nullptr),
      __count,
      __pred,
      __stream.get());

    {
      __temporary_storage<_OffsetType, decltype(__resource)> __storage{__stream, __resource, __num_bytes};

      // Run the kernel. __first must NOT be moved into this call: it is still needed below to
      // compute the returned iterator
      _CCCL_TRY_CUDA_API(
        ::cub::DeviceSelect::If,
        "__pstl_cuda_select_if: kernel launch of cub::DeviceSelect::If failed",
        __storage.__get_temp_storage(),
        __num_bytes,
        __first,
        __storage.__get_result_iter(),
        __count,
        ::cuda::std::move(__pred),
        __stream.get());

      // Copy the number of kept elements back from device storage to the host
      _CCCL_TRY_CUDA_API(
        ::cudaMemcpyAsync,
        "__pstl_cuda_select_if: copy of result from device to host failed",
        ::cuda::std::addressof(__ret),
        __storage.__res_,
        sizeof(_OffsetType),
        ::cudaMemcpyDefault,
        __stream.get());
    }

    // Synchronize so both the compaction and the async result copy have completed before returning
    __stream.sync();
    return __first + __ret;
  }

  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _UnaryPredicate)
  _CCCL_REQUIRES(__has_forward_traversal<_InputIterator>)
  [[nodiscard]] _CCCL_HOST_API _InputIterator operator()(
    [[maybe_unused]] const _Policy& __policy,
    _InputIterator __first,
    iter_difference_t<_InputIterator> __count,
    _UnaryPredicate __pred) const
  {
    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>)
    {
      try
      {
        return __par_impl(__policy, ::cuda::std::move(__first), __count, ::cuda::std::move(__pred));
      }
      catch (const ::cuda::cuda_error& __err)
      {
        if (__err.status() == ::cudaErrorMemoryAllocation)
        {
          // Device allocation failure is surfaced as std::bad_alloc, matching the parallel
          // algorithms' error reporting convention
          _CCCL_THROW(::std::bad_alloc);
        }
        else
        {
          throw; // rethrow the original exception object rather than a copy
        }
      }
    }
    else
    {
      static_assert(__always_false_v<_Policy>,
                    "__pstl_cuda_remove_if: CUDA backend of cuda::std::remove_if requires at least random access "
                    "iterators");
      auto __last = ::cuda::std::next(__first, __count);
      return ::cuda::std::remove_if(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
    }
  }
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

# include <cuda/std/__cccl/epilogue.h>

#endif /// _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_REMOVE_IF_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__pstl/dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ enum class __pstl_algorithm
__for_each_n,
__generate_n,
__reduce,
__remove_if,
__transform,
__transform_reduce,
};
Expand Down
99 changes: 99 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/remove.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_REMOVE_H
#define _CUDA_STD___PSTL_REMOVE_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

# include <cuda/__nvtx/nvtx.h>
# include <cuda/std/__algorithm/remove.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__iterator/concepts.h>
# include <cuda/std/__iterator/distance.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__type_traits/integral_constant.h>
# include <cuda/std/__type_traits/is_execution_policy.h>
# include <cuda/std/__utility/move.h>

# if _CCCL_HAS_BACKEND_CUDA()
# include <cuda/std/__pstl/cuda/remove_if.h>
# endif // _CCCL_HAS_BACKEND_CUDA()

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! Predicate used to implement cuda::std::remove on top of the remove_if backend: returns true
//! for elements that should be *kept*, i.e. those that do not compare equal to the stored value.
template <class _Tp>
struct __remove_compare_not_eq
{
  _Tp __val_; // the value to be removed from the range

  _CCCL_API constexpr __remove_compare_not_eq(const _Tp& __val) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __val_(__val)
  {}

  template <class _Up>
  [[nodiscard]] _CCCL_API _CCCL_FORCEINLINE constexpr bool operator()(const _Up& __elem) const
    noexcept(__is_cpp17_nothrow_equality_comparable_v<_Tp, _Up>)
  {
    // Keep the operand order (__val_ == __elem) so heterogeneous comparisons resolve identically
    const bool __is_equal = (__val_ == __elem);
    return !__is_equal;
  }
};

//! Parallel cuda::std::remove: shifts every element of [__first, __last) that does not compare
//! equal to __value towards the front and returns an iterator to the new logical end.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API _InputIterator
remove([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, const _Tp& __value)
{
  // remove is implemented via the remove_if backend with a "keep if not equal" predicate
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__remove_if, _Policy>();
  if constexpr (!::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::remove requires at least one selected backend");
    return ::cuda::std::remove(::cuda::std::move(__first), ::cuda::std::move(__last), __value);
  }
  else
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::remove");

    // Empty ranges short-circuit without touching the backend
    if (__first != __last)
    {
      const auto __num_elements = ::cuda::std::distance(__first, __last);
      return __dispatch(__policy, __first, __num_elements, __remove_compare_not_eq{__value});
    }
    return __first;
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

# include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_REMOVE_H
Loading