Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize crc32 & crc32c on NVIDIA Grace #2204

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions folly/Portability.h
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,22 @@ constexpr auto kHasWeakSymbols = false;
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_AES
#ifdef __ARM_FEATURE_AES
#define FOLLY_ARM_FEATURE_AES 1
#else
#define FOLLY_ARM_FEATURE_AES 0
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_SHA2
#ifdef __ARM_FEATURE_SHA2
#define FOLLY_ARM_FEATURE_SHA2 1
#else
#define FOLLY_ARM_FEATURE_SHA2 0
#endif
#endif

// RTTI may not be enabled for this compilation unit.
#if defined(__GXX_RTTI) || defined(__cpp_rtti) || \
(defined(_MSC_VER) && defined(_CPPRTTI))
Expand Down
58 changes: 58 additions & 0 deletions folly/external/nvidia/hash/Checksum.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <folly/Portability.h>

#if FOLLY_ARM_FEATURE_CRC32
#include <arm_acle.h>
#include <cstring>
#include <stddef.h>

namespace folly::detail {

uint32_t crc32_hw(const uint8_t* buf, size_t len, uint32_t crc) {
while (len >= 8) {
uint64_t val = 0;
std::memcpy(&val, buf, 8);
crc = __crc32d(crc, val);
len -= 8;
buf += 8;
}

if (len % 8 >= 4) {
uint32_t val = 0;
std::memcpy(&val, buf, 4);
crc = __crc32w(crc, val);
buf += 4;
}

if (len % 4 >= 2) {
uint16_t val = 0;
std::memcpy(&val, buf, 2);
crc = __crc32h(crc, val);
buf += 2;
}

if (len % 2 >= 1) {
crc = __crc32b(crc, *buf);
}

return crc;
}

} // namespace folly::detail

#endif // FOLLY_ARM_FEATURE_CRC32
57 changes: 57 additions & 0 deletions folly/external/nvidia/hash/detail/Crc32cCombineDetail-inl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <folly/Portability.h>
#if FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \
FOLLY_ARM_FEATURE_SHA2

#include <arm_acle.h>
#include <arm_neon.h>
namespace folly::detail {

static uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32c to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32cw(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}

static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32 to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32w(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}

} // namespace folly
#endif // FOLLY_ARM_FEATURE_CRC32
55 changes: 55 additions & 0 deletions folly/external/nvidia/hash/detail/Crc32cDetail.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#include <folly/Portability.h>

#if FOLLY_ARM_FEATURE_CRC32
#include <arm_acle.h>
#include <cstring>

namespace folly::detail {
uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
while (len >= 8) {
uint64_t val = 0;
std::memcpy(&val, buf, 8);
crc = __crc32cd(crc, val);
len -= 8;
buf += 8;
}

if (len % 8 >= 4) {
uint32_t val = 0;
std::memcpy(&val, buf, 4);
crc = __crc32cw(crc, val);
buf += 4;
}

if (len % 4 >= 2) {
uint16_t val = 0;
std::memcpy(&val, buf, 2);
crc = __crc32ch(crc, val);
buf += 2;
}

if (len % 2 >= 1) {
crc = __crc32cb(crc, *buf);
}

return crc;
}
} // namespace folly::detail
#endif // FOLLY_ARM_FEATURE_CRC32
29 changes: 28 additions & 1 deletion folly/hash/Checksum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ uint32_t crc32_hw(
}

bool crc32c_hw_supported() {
return crc32c_hw_supported_sse42();
}

bool crc32c_hw_supported_sse42() {
static folly::CpuId id;
return id.sse42();
}
Expand All @@ -86,7 +90,26 @@ bool crc32_hw_supported() {
return id.sse42();
}

#else
#elif FOLLY_ARM_FEATURE_CRC32
// crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp

bool crc32c_hw_supported() {
return true;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}

bool crc32_hw_supported() {
return true;
}

#else // FOLLY_ARM_FEATURE_CRC32

uint32_t crc32_hw(
const uint8_t* /* data */,
Expand All @@ -99,6 +122,10 @@ bool crc32c_hw_supported() {
return false;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}
Expand Down
6 changes: 6 additions & 0 deletions folly/hash/detail/ChecksumDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ uint32_t crc32c_hw(
* Check whether a SSE4.2 hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported_sse42();

/**
* Check whether a hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported();

/**
Expand Down
9 changes: 8 additions & 1 deletion folly/hash/detail/Crc32CombineDetail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <folly/Bits.h>
#include <folly/hash/detail/ChecksumDetail.h>

#include <folly/external/nvidia/hash/detail/Crc32cCombineDetail-inl.h>

namespace folly {

// Standard galois-field multiply. The only modification is that a,
Expand Down Expand Up @@ -105,6 +107,11 @@ static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(res3, res1), 4));
}

#elif FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \
FOLLY_ARM_FEATURE_SHA2

// gf_multiply_crc32c_hw and fg_multiply_crc32_hw are defined in
// external/nvidia/hash/detail/Crc32cCombineDetail-inl.h
#else

static uint32_t gf_multiply_crc32c_hw(uint64_t, uint64_t, uint32_t) {
Expand All @@ -114,7 +121,7 @@ static uint32_t gf_multiply_crc32_hw(uint64_t, uint64_t, uint32_t) {
return 0;
}

#endif
#endif // FOLLY_SSE_PREREQ(4, 2)

static constexpr uint32_t crc32c_m = 0x82f63b78;
static constexpr uint32_t crc32_m = 0xedb88320;
Expand Down
4 changes: 3 additions & 1 deletion folly/hash/detail/Crc32cDetail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,14 +286,16 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
return (uint32_t)crc0;
}

#elif defined(FOLLY_ARM_FEATURE_CRC32) // defined(FOLLY_X64) && FOLLY_SSE_PREREQ(4, 2)
// crc32c_hw is defined in external/nvidia/hash/detail/Crc32cDetail.cpp
#else

uint32_t crc32c_hw(
const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) {
throw std::runtime_error("crc32_hw is not implemented on this platform");
}

#endif
#endif // !defined(FOLLY_ARM_FEATURE_CRC32)

} // namespace detail
} // namespace folly
8 changes: 4 additions & 4 deletions folly/hash/test/ChecksumTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ TEST(Checksum, crc32cContinuationHardware) {
}

TEST(Checksum, crc32cHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32C(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand All @@ -156,7 +156,7 @@ TEST(Checksum, crc32cHardwareSse42) {
}

TEST(Checksum, crc32cHardwareEqSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
for (size_t i = 0; i < 1000; i++) {
auto sw = folly::detail::crc32c_sw(buffer, i, 0);
auto hw = folly::detail::sse_crc32c_v8s3x3(buffer, i, 0);
Expand All @@ -169,7 +169,7 @@ TEST(Checksum, crc32cHardwareEqSse42) {
}

TEST(Checksum, crc32cContinuationHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand Down Expand Up @@ -220,7 +220,7 @@ TEST(Checksum, crc32clargeBuffers) {

constexpr uint32_t kCrc = 2860399007;

if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
auto crcSse42 = folly::detail::sse_crc32c_v8s3x3(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcSse42);
auto crcHw = folly::detail::crc32c_hw(bufp, kLargeBufSz, ~0);
Expand Down
Loading