Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize crc32 & crc32c on NVIDIA Grace #2204

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions folly/Portability.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,22 @@ constexpr auto kHasWeakSymbols = false;
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_AES
#ifdef __ARM_FEATURE_AES
#define FOLLY_ARM_FEATURE_AES 1
#else
#define FOLLY_ARM_FEATURE_AES 0
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_SHA2
#ifdef __ARM_FEATURE_SHA2
#define FOLLY_ARM_FEATURE_SHA2 1
#else
#define FOLLY_ARM_FEATURE_SHA2 0
#endif
#endif

// RTTI may not be enabled for this compilation unit.
#if defined(__GXX_RTTI) || defined(__cpp_rtti) || \
(defined(_MSC_VER) && defined(_CPPRTTI))
Expand Down
57 changes: 56 additions & 1 deletion folly/hash/Checksum.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,6 +33,11 @@
#include <nmmintrin.h>
#endif

#if FOLLY_ARM_FEATURE_CRC32
#include <arm_acle.h>
#include <stddef.h>
#endif

namespace folly {

namespace detail {
Expand Down Expand Up @@ -71,6 +77,10 @@ uint32_t crc32_hw(
}

bool crc32c_hw_supported() {
return crc32c_hw_supported_sse42();
}

bool crc32c_hw_supported_sse42() {
static folly::CpuId id;
return id.sse42();
}
Expand All @@ -86,7 +96,48 @@ bool crc32_hw_supported() {
return id.sse42();
}

#else
#elif FOLLY_ARM_FEATURE_CRC32
uint32_t crc32_hw(const uint8_t* buf, size_t len, uint32_t crc) {
auto* buf_64 = reinterpret_cast<const uint64_t*>(buf);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like unaligned access.

Note that while an architecture may support unaligned access, the language generally deems unaligned access of this form to be undefined behavior. The kind of undefined behavior that may end up being subject to miscompilations.

To avoid this unaligned access, do we need to start off with an optional round of each of __crc32b, __crc32h, __crc32w, and __crc32d? Alternatively, perhaps we can memcpy from buf instead of reinterpret_cast - modern compilers recognize this idiom and lower to mov or ldr instructions without emitting calls to memcpy.

Here and crc32c_hw below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, you're totally right. Unaligned access isn't directly forbidden in C++ but my implementation is incorrect by the strict aliasing rule. While this is typically fine when operating on bytes my code is still incorrect.

The memcpy version is a good idea. It's obviously correct and at a first glance seems to lead to (nearly) the same code for GCC/clang. I'll test it and update the PR.

while (len >= 8) {
crc = __crc32d(crc, *buf_64++);
len -= 8;
}

auto* buf_32 = reinterpret_cast<const uint32_t*>(buf_64);
if (len % 8 >= 4) {
crc = __crc32w(crc, *buf_32++);
}

auto* buf_16 = reinterpret_cast<const uint16_t*>(buf_32);
if (len % 4 >= 2) {
crc = __crc32h(crc, *buf_16++);
}

auto* buf_8 = reinterpret_cast<const uint8_t*>(buf_16);
if (len % 2 >= 1) {
crc = __crc32b(crc, *buf_8++);
}
return crc;
}

bool crc32c_hw_supported() {
return true;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}

bool crc32_hw_supported() {
return true;
}

#else // FOLLY_ARM_FEATURE_CRC32

uint32_t crc32_hw(
const uint8_t* /* data */,
Expand All @@ -99,6 +150,10 @@ bool crc32c_hw_supported() {
return false;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}
Expand Down
6 changes: 6 additions & 0 deletions folly/hash/detail/ChecksumDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ uint32_t crc32c_hw(
* Check whether a SSE4.2 hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported_sse42();

/**
* Check whether a hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported();

/**
Expand Down
41 changes: 39 additions & 2 deletions folly/hash/detail/Crc32CombineDetail.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +21,11 @@
#include <folly/Bits.h>
#include <folly/hash/detail/ChecksumDetail.h>

#if FOLLY_ARM_FEATURE_CRC32
#include <arm_acle.h>
#include <arm_neon.h>
#endif

namespace folly {

// Standard galois-field multiply. The only modification is that a,
Expand Down Expand Up @@ -105,7 +111,38 @@ static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(res3, res1), 4));
}

#else
#elif FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \
FOLLY_ARM_FEATURE_SHA2
static uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32c to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32cw(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}

static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32 to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32w(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}
#else // FOLLY_ARM_FEATURE_CRC32

static uint32_t gf_multiply_crc32c_hw(uint64_t, uint64_t, uint32_t) {
return 0;
Expand All @@ -114,7 +151,7 @@ static uint32_t gf_multiply_crc32_hw(uint64_t, uint64_t, uint32_t) {
return 0;
}

#endif
#endif // FOLLY_SSE_PREREQ(4, 2)

static constexpr uint32_t crc32c_m = 0x82f63b78;
static constexpr uint32_t crc32_m = 0xedb88320;
Expand Down
35 changes: 34 additions & 1 deletion folly/hash/detail/Crc32cDetail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
* use intrinsics instead of inline asm
* other code cleanup
*/
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*/

#include <stdexcept>

Expand All @@ -38,6 +41,10 @@
#include <folly/CppAttributes.h>
#include <folly/hash/detail/ChecksumDetail.h>

#if FOLLY_ARM_FEATURE_CRC32
#include <arm_acle.h>
#endif

namespace folly {
namespace detail {

Expand Down Expand Up @@ -286,7 +293,33 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
return (uint32_t)crc0;
}

#else
#elif FOLLY_ARM_FEATURE_CRC32 // defined(FOLLY_X64) && FOLLY_SSE_PREREQ(4, 2)

uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
auto* buf_64 = reinterpret_cast<const uint64_t*>(buf);
while (len >= 8) {
crc = __crc32cd(crc, *buf_64++);
len -= 8;
}

auto* buf_32 = reinterpret_cast<const uint32_t*>(buf_64);
if (len % 8 >= 4) {
crc = __crc32cw(crc, *buf_32++);
}

auto* buf_16 = reinterpret_cast<const uint16_t*>(buf_32);
if (len % 4 >= 2) {
crc = __crc32ch(crc, *buf_16++);
}

auto* buf_8 = reinterpret_cast<const uint8_t*>(buf_16);
if (len % 2 >= 1) {
crc = __crc32cb(crc, *buf_8++);
}
return crc;
}

#else // FOLLY_ARM_FEATURE_CRC32

uint32_t crc32c_hw(
const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) {
Expand Down
8 changes: 4 additions & 4 deletions folly/hash/test/ChecksumTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ TEST(Checksum, crc32cContinuationHardware) {
}

TEST(Checksum, crc32cHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32C(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand All @@ -156,7 +156,7 @@ TEST(Checksum, crc32cHardwareSse42) {
}

TEST(Checksum, crc32cHardwareEqSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
for (size_t i = 0; i < 1000; i++) {
auto sw = folly::detail::crc32c_sw(buffer, i, 0);
auto hw = folly::detail::sse_crc32c_v8s3x3(buffer, i, 0);
Expand All @@ -169,7 +169,7 @@ TEST(Checksum, crc32cHardwareEqSse42) {
}

TEST(Checksum, crc32cContinuationHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand Down Expand Up @@ -220,7 +220,7 @@ TEST(Checksum, crc32clargeBuffers) {

constexpr uint32_t kCrc = 2860399007;

if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
auto crcSse42 = folly::detail::sse_crc32c_v8s3x3(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcSse42);
auto crcHw = folly::detail::crc32c_hw(bufp, kLargeBufSz, ~0);
Expand Down
Loading