Ensure cuda::std::bit_cast works with vector types #3183

Open: wants to merge 2 commits into base: main
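The behavior this PR is meant to guarantee, as a minimal sketch (hypothetical example, assuming a CUDA translation unit where float4 is available; the test added below exercises the same round-trip through a byte buffer):

#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/cassert>

__host__ __device__ void example_roundtrip()
{
  float4 v{1.0f, 2.0f, 3.0f, 4.0f};
  // bit_cast needs only equal sizes and bit-castable types (trivially copyable
  // or extended floating point), so a CUDA vector type can be reinterpreted as
  // a same-sized byte array and back without changing its value representation.
  auto bytes  = cuda::std::bit_cast<cuda::std::array<unsigned char, sizeof(float4)>>(v);
  float4 back = cuda::std::bit_cast<float4>(bytes);
  assert(back.x == v.x && back.y == v.y && back.z == v.z && back.w == v.w);
}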
26 changes: 20 additions & 6 deletions libcudacxx/include/cuda/std/__bit/bit_cast.h
@@ -21,10 +21,13 @@
# pragma system_header
#endif // no system header

#include <cuda/std/__fwd/array.h>
#include <cuda/std/__tuple_dir/sfinae_helpers.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_extended_floating_point.h>
#include <cuda/std/__type_traits/is_trivially_copyable.h>
#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
#include <cuda/std/__type_traits/remove_extent.h>
#include <cuda/std/detail/libcxx/include/cstring>

_LIBCUDACXX_BEGIN_NAMESPACE_STD
@@ -40,12 +43,23 @@ _CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess")
# endif // _CCCL_COMPILER(GCC, >=, 8)
#endif // !_CCCL_BUILTIN_BIT_CAST

template <
  class _To,
  class _From,
  enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
  enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _To) || _CCCL_TRAIT(__is_extended_floating_point, _To), int> = 0,
  enable_if_t<_CCCL_TRAIT(is_trivially_copyable, _From) || _CCCL_TRAIT(__is_extended_floating_point, _From), int> = 0>
// Also allow arrays / tuples
template <class _Tp>
struct __is_bit_castable
{
  static constexpr bool value =
    _CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
};

#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <class _Tp>
_CCCL_INLINE_VAR constexpr bool __is_bit_castable_v =
  _CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
#endif // !_CCCL_NO_VARIABLE_TEMPLATES

_CCCL_TEMPLATE(class _To, class _From)
_CCCL_REQUIRES((sizeof(_To) == sizeof(_From)) //
                 _CCCL_AND _CCCL_TRAIT(__is_bit_castable, _To) _CCCL_AND _CCCL_TRAIT(__is_bit_castable, _From))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
{
#if defined(_CCCL_BUILTIN_BIT_CAST)
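For reference, a rough equivalent of the new constraint spelled without the CCCL requirement macros (a sketch only, assuming _CCCL_TRAIT(X, T) resolves to X<T>::value):

template <class _To,
          class _From,
          enable_if_t<sizeof(_To) == sizeof(_From) && __is_bit_castable<_To>::value && __is_bit_castable<_From>::value,
                      int> = 0>
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept;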
241 additions: new test file
@@ -0,0 +1,241 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

// <cuda/std/bit>
//
// template<class To, class From>
// constexpr To bit_cast(const From& from) noexcept;

#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/cassert>
#include <cuda/std/cmath>
#include <cuda/std/cstdint>
#include <cuda/std/limits>
#include <cuda/std/tuple>
#include <cuda/std/utility>

#include "test_macros.h"

__host__ __device__ int test_memcmp(const void* lhs, const void* rhs, cuda::std::size_t bytes) noexcept
{
  const unsigned char* clhs = (const unsigned char*) lhs;
  const unsigned char* crhs = (const unsigned char*) rhs;

  for (; bytes > 0; --bytes)
  {
    if (*clhs++ != *crhs++)
    {
      return clhs[-1] < crhs[-1] ? -1 : 1;
    }
  }
  return 0;
}

template <class T>
__host__ __device__ bool equal(const T& lhs, const T& rhs)
{
  return test_memcmp(&lhs, &rhs, sizeof(T)) == 0;
}

// cuda::std::bit_cast does not preserve padding bits, so if T has padding bits,
// the results might not memcmp cleanly.
template <bool HasUniqueObjectRepresentations = true, typename T>
__host__ __device__ void test_roundtrip_through_buffer(T from)
{
  struct Buffer
  {
    char buffer[sizeof(T)];
  };
  Buffer middle  = cuda::std::bit_cast<Buffer>(from);
  T to           = cuda::std::bit_cast<T>(middle);
  Buffer middle2 = cuda::std::bit_cast<Buffer>(to);

  assert(equal(from, to) == equal(from, from)); // because NaN

  _CCCL_IF_CONSTEXPR (HasUniqueObjectRepresentations)
  {
    assert(test_memcmp(&from, &middle, sizeof(T)) == 0);
    assert(test_memcmp(&to, &middle, sizeof(T)) == 0);
    assert(test_memcmp(&middle, &middle2, sizeof(T)) == 0);
  }
}

template <typename T>
__host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 10> generate_signed_integral_values()
{
  return {cuda::std::numeric_limits<T>::min(),
          cuda::std::numeric_limits<T>::min() + 1,
          static_cast<T>(-2),
          static_cast<T>(-1),
          static_cast<T>(0),
          static_cast<T>(1),
          static_cast<T>(2),
          static_cast<T>(3),
          cuda::std::numeric_limits<T>::max() - 1,
          cuda::std::numeric_limits<T>::max()};
}

#define REPEAT_1(base_type, index) base_type(input[index][0])
#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), base_type(input[index][1])
#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), base_type(input[index][2])
#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), base_type(input[index][3])

#define TEST_CUDA_VECTOR_TYPE(base_type, size)            \
  {                                                        \
    for (base_type##size i :                               \
         {base_type##size{REPEAT_##size(base_type, 0)},    \
          base_type##size{REPEAT_##size(base_type, 1)},    \
          base_type##size{REPEAT_##size(base_type, 2)},    \
          base_type##size{REPEAT_##size(base_type, 3)},    \
          base_type##size{REPEAT_##size(base_type, 4)},    \
          base_type##size{REPEAT_##size(base_type, 5)},    \
          base_type##size{REPEAT_##size(base_type, 6)}})   \
    {                                                      \
      test_roundtrip_through_buffer(i);                    \
    }                                                      \
  }

#define TEST_CUDA_VECTOR_TYPES(base_type) \
  TEST_CUDA_VECTOR_TYPE(base_type, 1)     \
  TEST_CUDA_VECTOR_TYPE(base_type, 2)     \
  TEST_CUDA_VECTOR_TYPE(base_type, 3)     \
  TEST_CUDA_VECTOR_TYPE(base_type, 4)
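
// Illustration only: TEST_CUDA_VECTOR_TYPE(float, 2) expands roughly to
//
//   {
//     for (float2 i : {float2{float(input[0][0]), float(input[0][1])},
//                      float2{float(input[1][0]), float(input[1][1])},
//                      /* ... indices 2 through 6 ... */
//                      float2{float(input[6][0]), float(input[6][1])}})
//     {
//       test_roundtrip_through_buffer(i);
//     }
//   }
// where `input` is the table of doubles defined in tests() below.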

template <typename T>
__host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 6> generate_unsigned_integral_values()
{
  return {static_cast<T>(0),
          static_cast<T>(1),
          static_cast<T>(2),
          static_cast<T>(3),
          static_cast<T>(cuda::std::numeric_limits<T>::max() - 1),
          cuda::std::numeric_limits<T>::max()};
}

__host__ __device__ bool tests()
{
  using pair = cuda::std::pair<float, int>;
  for (pair i :
       {pair{0.0f, 1},
        pair{1.0f, 2},
        pair{-1.0f, 3},
        pair{10.0f, 4},
        pair{-10.0f, 5},
        pair{2.71828f, 6},
        pair{3.14159f, 7}})
  {
    test_roundtrip_through_buffer(i);
  }

#if defined(_CCCL_BUILTIN_BIT_CAST) // tuple is not trivially default constructible
  using tuple = cuda::std::tuple<float, int, short>;
  for (tuple i :
       {tuple{0.0f, 1, -1},
        tuple{1.0f, 2, -2},
        tuple{-1.0f, 3, -3},
        tuple{10.0f, 4, -4},
        tuple{-10.0f, 5, -5},
        tuple{2.71828f, 6, -6},
        tuple{3.14159f, 7, -7}})
  {
    test_roundtrip_through_buffer(i);
  }
#endif // _CCCL_BUILTIN_BIT_CAST

  using array = cuda::std::array<float, 2>;
  for (array i :
       {array{0.0f, 1.0f},
        array{1.0f, 2.0f},
        array{-1.0f, 3.0f},
        array{10.0f, 4.0f},
        array{-10.0f, 5.0f},
        array{2.71828f, 6.0f},
        array{3.14159f, 7.0f}})
  {
    test_roundtrip_through_buffer(i);
  }

  float carray[2] = {0.0f, 1.0f};
  test_roundtrip_through_buffer(carray);

  // test cuda vector types except __half2 and __nv_bfloat162 because they are cursed
  constexpr double input[7][4] = {
    {0.0, 1.0, -7.0, -0.0},
    {1.0, 2.0, -7.0, -1.0},
    {-1.0, 3.0, -7.0, 1.0},
    {10.0, 4.0, -7.0, -10.0},
    {-10.0, 5.0, -7.0, 10.0},
    {2.71828, 6.0, -7.0, -2.71828},
    {3.14159, 7.0, -7.0, -3.14159}};

#if !_CCCL_CUDA_COMPILER(CLANG)
  using uchar  = unsigned char;
  using ushort = unsigned short;
  using uint   = unsigned int;
  using ulong  = unsigned long;
#endif // !_CCCL_CUDA_COMPILER(CLANG)
  using longlong  = long long;
  using ulonglong = unsigned long long;

  TEST_CUDA_VECTOR_TYPES(char)
  TEST_CUDA_VECTOR_TYPES(uchar)
  TEST_CUDA_VECTOR_TYPES(short)
  TEST_CUDA_VECTOR_TYPES(ushort)
  TEST_CUDA_VECTOR_TYPES(int)
  TEST_CUDA_VECTOR_TYPES(uint)
  TEST_CUDA_VECTOR_TYPES(long)
  TEST_CUDA_VECTOR_TYPES(ulong)
  TEST_CUDA_VECTOR_TYPES(longlong)
  TEST_CUDA_VECTOR_TYPES(ulonglong)
  TEST_CUDA_VECTOR_TYPES(float)
  TEST_CUDA_VECTOR_TYPES(double)

  using dim = unsigned int;
  TEST_CUDA_VECTOR_TYPE(dim, 3)

#ifdef _LIBCUDACXX_HAS_NVFP16
  // Extended floating point type __half
  for (__half i :
       {__float2half(0.0f),
        __float2half(1.0f),
        __float2half(-1.0f),
        __float2half(10.0f),
        __float2half(-10.0f),
        __float2half(2.71828f),
        __float2half(3.14159f)})
  {
    test_roundtrip_through_buffer(i);
  }
#endif // _LIBCUDACXX_HAS_NVFP16

#ifdef _LIBCUDACXX_HAS_NVBF16
  // Extended floating point type __nv_bfloat16
  for (__nv_bfloat16 i :
       {__float2bfloat16(0.0f),
        __float2bfloat16(1.0f),
        __float2bfloat16(-1.0f),
        __float2bfloat16(10.0f),
        __float2bfloat16(-10.0f),
        __float2bfloat16(2.71828f),
        __float2bfloat16(3.14159f)})
  {
    test_roundtrip_through_buffer(i);
  }
#endif // _LIBCUDACXX_HAS_NVBF16

  return true;
}

int main(int, char**)
{
  tests();
  return 0;
}