Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds #1654

pps83 · 2024-04-18T01:38:12Z

_mm_cvtsi128_si64, _mm_crc32_u64, and _mm_extract_epi64 intrinsics are available only when building for x64. In order not to disable crc32 optimizations for 32-bit builds, equivalent code is implemented using intrinsics that are available when targeting 32-bit builds.

pps83 · 2024-04-18T19:52:48Z

Can this be changed to a draft? This PR isn't ready; V128_Extract64 still needs to be handled as well. After making all the changes I built with tests to verify them, and everything passed. I then intentionally swapped hi/lo to introduce errors and confirm that the tests fail... and sure enough, the tests that should have failed still passed :) It turned out that cmake on Windows builds without AVX (whaaat??). It took me some time to figure out how to enable AVX/AVX2 with cmake (a total trainwreck; I had to manually add a compiler command-line parameter), and the tests still passed even though the code had intentional errors. My only guess so far is that the code related to CPU detection is quite questionable: it's compiled out for 32-bit builds, as if CPU detection were a 64-bit-only thing. I'm not sure whether the optimized CRC needs CPU detection to work (e.g. is it enabled at runtime or at compile time?).
When I enabled cpu detection for 32-bit builds suddenly tests hung (endless loop somewhere) regardless if my updates had errors or not.
Individual instructions that I updated as part of the PR - I'm more or less sure that this is OK.

derekmauro · 2024-04-18T20:11:59Z

I converted it to a draft for you. Reply here when it is ready for me to look at.

pps83 · 2024-04-18T20:51:00Z

This is a sample program that verifies correctness. If you run it as a 64-bit build with asserts enabled, you'll see that the alternative code is equivalent:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <random>

// 128-bit integer SIMD vector type taken and returned by the V128_* helpers below.
using V128 = __m128i;

// ABSL_ARCH_X86_64 is defined when the compiler targets 64-bit x86
// (__x86_64__ or _M_X64 is predefined). The helpers below select the
// 64-bit-only intrinsics when it is set and the 32-bit-compatible
// replacements otherwise.
#if defined(__x86_64__) || defined(_M_X64)
#define ABSL_ARCH_X86_64
#endif

// Returns a random 64-bit value assembled from two draws of
// std::random_device: the first draw fills the upper 32 bits and the second
// is XORed into the lower bits.
//
// The device is created once per thread and reused; constructing a fresh
// std::random_device on every call is needlessly expensive.
static uint64_t rand64()
{
    static thread_local std::random_device rnd;
    // Draw in two separate statements so the order of the draws is well
    // defined (operand evaluation order inside a single expression is not).
    const uint64_t hi = rnd();
    const uint64_t lo = rnd();
    return (hi << 32) ^ lo;
}

// Folds the 32-bit value `v` into the running CRC `crc` with the
// _mm_crc32_u32 intrinsic and returns the updated CRC.
inline uint32_t CRC32_u32(uint32_t crc, uint32_t v)
{
    const uint32_t updated = _mm_crc32_u32(crc, v);
    return updated;
}

// Loads 128 bits from `src` using the aligned-load intrinsic
// (_mm_load_si128), which requires `src` to be 16-byte aligned.
inline V128 V128_Load(const V128* src)
{
    return _mm_load_si128(src);
}

// Loads 128 bits from `src` using the unaligned-load intrinsic
// (_mm_loadu_si128); `src` needs no particular alignment.
inline V128 V128_LoadU(const V128* src)
{
    return _mm_loadu_si128(src);
}

// Stores the 128-bit value `data` to `dst` using the aligned-store intrinsic
// (_mm_store_si128), which requires `dst` to be 16-byte aligned.
inline void V128_Store(V128* dst, V128 data)
{
    _mm_store_si128(dst, data);
}

// Folds the 64-bit value `v` into the running CRC `crc` and returns the
// updated CRC.
//
// 64-bit targets use _mm_crc32_u64 directly. Other targets feed the two
// 32-bit halves of `v` through CRC32_u32, low half first.
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v)
{
#ifndef ABSL_ARCH_X86_64
    const uint32_t low_half = static_cast<uint32_t>(v);
    const uint32_t high_half = static_cast<uint32_t>(v >> 32);
    const uint32_t after_low = CRC32_u32(crc, low_half);
    return CRC32_u32(after_low, high_half);
#else
    const auto wide = _mm_crc32_u64(crc, v);
    return static_cast<uint32_t>(wide);
#endif
}

// Returns 64-bit lane `imm` of `l` as an unsigned value.
//
// 64-bit targets use _mm_extract_epi64. Other targets read the two 32-bit
// elements that make up the lane (indices 2*imm and 2*imm+1) and join them,
// with the higher-indexed element forming the upper 32 bits.
template <int imm>
inline uint64_t V128_Extract64(const V128 l)
{
#ifndef ABSL_ARCH_X86_64
    const uint64_t low = static_cast<uint32_t>(_mm_extract_epi32(l, imm * 2 + 0));
    const uint64_t high = static_cast<uint32_t>(_mm_extract_epi32(l, imm * 2 + 1));
    return (high << 32) | low;
#else
    const auto lane = _mm_extract_epi64(l, imm);
    return static_cast<uint64_t>(lane);
#endif
}

// Returns the low 64 bits of `l` as a signed value.
//
// 64-bit targets use _mm_cvtsi128_si64. Other targets read 32-bit elements
// 0 and 1 and join them, with element 1 forming the upper 32 bits.
inline int64_t V128_Low64(const V128 l)
{
#ifndef ABSL_ARCH_X86_64
    const uint64_t low = static_cast<uint32_t>(_mm_extract_epi32(l, 0));
    const uint64_t high = static_cast<uint32_t>(_mm_extract_epi32(l, 1));
    const uint64_t joined = (high << 32) | low;
    return static_cast<int64_t>(joined);
#else
    return _mm_cvtsi128_si64(l);
#endif
}

// Unconditional copies of the 32-bit-compatible code paths, so that a 64-bit
// build can compare them against the native 64-bit intrinsics.
namespace x86_32
{
// Folds the 64-bit value `v` into `crc` as two 32-bit steps, low half first.
inline uint32_t CRC32_u64(uint32_t crc, uint64_t v)
{
    const uint32_t low_half = static_cast<uint32_t>(v);
    const uint32_t high_half = static_cast<uint32_t>(v >> 32);
    const uint32_t after_low = CRC32_u32(crc, low_half);
    return CRC32_u32(after_low, high_half);
}

// Returns 64-bit lane `imm` of `l`, built from 32-bit elements 2*imm (lower
// 32 bits) and 2*imm+1 (upper 32 bits).
template <int imm>
inline uint64_t V128_Extract64(const V128 l)
{
    const uint64_t low = static_cast<uint32_t>(_mm_extract_epi32(l, imm * 2 + 0));
    const uint64_t high = static_cast<uint32_t>(_mm_extract_epi32(l, imm * 2 + 1));
    return (high << 32) | low;
}

// Returns the low 64 bits of `l`, built from 32-bit elements 0 (lower 32
// bits) and 1 (upper 32 bits).
inline int64_t V128_Low64(const V128 l)
{
    const uint64_t low = static_cast<uint32_t>(_mm_extract_epi32(l, 0));
    const uint64_t high = static_cast<uint32_t>(_mm_extract_epi32(l, 1));
    const uint64_t joined = (high << 32) | low;
    return static_cast<int64_t>(joined);
}
}


// Self-check: compares each top-level helper against its x86_32 counterpart
// on random inputs. The comparison is only meaningful in a 64-bit build,
// where the top-level helpers use the native 64-bit intrinsics.
//
// All checks are assert()s, so the program verifies nothing when built with
// NDEBUG. memcmp is declared in <string.h>.
int main()
{
    constexpr size_t kCases = 10;

    // CRC32_u64
    for (size_t i = 0; i < kCases; ++i)
    {
        // i == 0 yields x == 0, so the all-zero input is always covered.
        const uint64_t x = i * rand64();
        const uint32_t crc = 0;
        const auto r0 = CRC32_u64(crc, x);
        const auto r1 = x86_32::CRC32_u64(crc, x);
        assert(r0 == r1);
    }

    // Fill kCases vectors with random data, two 64-bit lanes per vector.
    V128 mem[kCases];
    uint64_t* p = reinterpret_cast<uint64_t*>(&mem[0]);
    for (size_t i = 0; i < kCases; ++i)
    {
        p[i * 2 + 0] = i * rand64();
        p[i * 2 + 1] = i * rand64();
    }

    // V128_Load is identical to V128_LoadU
    for (size_t i = 0; i < kCases; ++i)
    {
        V128 x[2];
        V128_Store(x + 0, V128_Load(&mem[i]));
        V128_Store(x + 1, V128_LoadU(&mem[i]));
        assert(0 == memcmp(x + 0, x + 1, sizeof(x[0]))); // V128_Load is identical to V128_LoadU
    }

    // V128_Low64
    for (size_t i = 0; i < kCases; ++i)
    {
        const auto r0 = V128_Low64(V128_Load(mem + i));
        const auto r1 = x86_32::V128_Low64(V128_Load(mem + i));
        assert(r0 == r1);
    }

    // V128_Extract64<0>
    for (size_t i = 0; i < kCases; ++i)
    {
        const auto r0 = V128_Extract64<0>(V128_Load(mem + i));
        const auto r1 = x86_32::V128_Extract64<0>(V128_Load(mem + i));
        assert(r0 == r1);
    }
    // V128_Extract64<1>
    for (size_t i = 0; i < kCases; ++i)
    {
        const auto r0 = V128_Extract64<1>(V128_Load(mem + i));
        const auto r1 = x86_32::V128_Extract64<1>(V128_Load(mem + i));
        assert(r0 == r1);
    }
    return 0;
}

pps83 · 2024-04-20T08:16:13Z

@derekmauro PR is ready.

pps83 · 2024-04-20T08:50:42Z

When I enabled cpu detection for 32-bit builds suddenly tests hung (endless loop somewhere) regardless if my updates had errors or not.

For some reason, Release builds for x86 hang or throw SEH errors in CRC32AcceleratedX86ARMCombined::ComputeZeroConstant. When I attach a debugger while running absl_crc_cord_state_test, I see that length has some huge value. I tried adding printfs to print the value of length inside the while(length) loop, but that makes the error go away. If I build RelWithDebInfo, it's also OK. Very strange overall, as if it's some kind of compiler bug.

`_mm_cvtsi128_si64`, `_mm_crc32_u64`, and `_mm_extract_epi64` intrinsics are available only when building for x64. In order not to disable crc32 optimizations for 32-bit builds, equivalent code is implemented using intrinsics that are available when targeting 32-bit builds.

pps83 · 2024-04-22T01:33:09Z

Very strange overall, as if it's some kind of compiler bug.

I'm convinced that MS compiler has some sort of bug: https://developercommunity.visualstudio.com/t/Compiler-produces-bad-code-in-x86-releas/10642810

derekmauro · 2024-04-22T20:08:53Z

It looks like MSVC confirmed this is a compiler bug and has a fix pending. The good news is that you have made MSVC better for everyone, so congrats. The bad news is that I don't think we should accept this if the compiler is going to get it wrong. We probably need some preprocessor guarding of the code for the buggy compiler versions.

pps83 · 2024-04-24T01:16:18Z

It looks like MSVC confirmed this is a compiler bug and has a fix pending.

Yep, quite shocking. Nothing unusual about the code, seems like some intrinsic isn't handled properly. I think the bug was introduced years ago.

The good news is that you have made MSVC better for everyone, so congrats. The bad news is that I don't think we should accept this if the compiler is going to get it wrong. We probably need some preprocessor guarding of the code for the buggy compiler versions.

yep, compiler check is needed, or the code could be modified to avoid the bug.
Perhaps, multiply in crc_x86_arm_combined.cc can be adjusted. Is there alternative impl for the code that doesn't use intrinsics? Also, the last line there:

return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
       CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));

should perhaps be changed to

return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
       CRC32_u32(0, V128_Extract32<0>(res));

because it does static_cast<uint32_t>(V128_Low64(res)), which basically takes 64 bits out of the mm-register and then takes the lower 32 bits of those 64 bits. All this while there is also V128_Extract32<1>(res), which extracts the upper 32 bits of the same 64 bits.

pps83 · 2024-04-24T01:18:34Z

The bad news is that I don't think we should accept this if the compiler is going to get it wrong

IMO, it should be taken, no point to exclude all x86 32-bit builds because MS compiler gets it wrong. Should perhaps be guarded by _MSC_VER (that doesn't have __clang__ or intel compiler defines).

pps83 · 2024-04-24T03:54:02Z

Here are the changes to multiply function that make it work with 32bit msvc build:

// Variant of multiply() proposed in this thread as a workaround for the
// 32-bit MSVC miscompile. Per the thread, building the operands with
// _mm_set1_epi64x instead of V128_From64WithZeroFill is what makes the bad
// codegen go away; using an add instead of a shift for the doubling has no
// effect on the miscompile.
// NOTE(review): V128_PMulLow and V128_Extract32 are defined elsewhere; their
// exact semantics are not visible here.
uint32_t multiply(uint32_t a, uint32_t b) {
  // _mm_set1_epi64x places the operand in both 64-bit lanes. The thread
  // states that, in this function, filling the upper lane too (rather than
  // zeroing it) is irrelevant to the result.
  V128 res = V128_PMulLow(_mm_set1_epi64x(a), _mm_set1_epi64x(b));

  // Combine crc values
  // res + res doubles each 64-bit lane, the same as shifting it left by one.
  res = _mm_add_epi64(res, res);
  // XOR 32-bit element 1 of res with the CRC (seed 0) of the low 32 bits.
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
}

In short, if I replace V128_From64WithZeroFill with _mm_set1_epi64x, the bug goes away. In this particular function the difference between _mm_set1_epi64x and _mm_set_epi64x is irrelevant to the result, but the change fixes the miscompile issue.

The other change that should be made there in the function:
replace (this doesn't fix msvc miscompile):

V128 shifts = V128_From64WithZeroFill(1);
...
res = V128_ShiftLeft64(res, shifts);

with

res = _mm_add_epi64(res, res);

Or, the rest of the function could be moved back to regular registers (this doesn't fix msvc miscompile):

    uint64_t x = uint64_t(V128_Low64(zz)) << 1;

    // Combine crc values
    return static_cast<uint32_t>(x >> 32) ^
        CRC32_u32(0, static_cast<uint32_t>(x));

derekmauro · 2024-05-08T16:59:47Z

I believe with your fix for multiply, if you rebase this, it should be ready for import, correct?

pps83 · 2024-05-11T15:05:31Z

I believe with your fix for multiply, if you rebase this, it should be ready for import, correct?

are you referring to add(a, a) vs a << 1? No, the change has no effect on the miscompile issue.

I think this PR can be merged, but should be disabled for ms compiler: defined(_MSC_VER) && !defined(__clang__)

Separately, the code could be amended to avoid compilation issue with ms compiler.

pps83 · 2024-05-11T17:45:03Z

Separately, the code could be amended to avoid compilation issue with ms compiler.

It seems that this change is needed to fix the miscompile (at least it fixes the test case I reported to MS):

// Builds a 128-bit vector whose low 64 bits are `r` and whose high 64 bits
// are zero.
inline V128 V128_From64WithZeroFill(const uint64_t r) {
#if defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
  // 32-bit MSVC (not clang-cl): assemble the vector from 32-bit pieces rather
  // than with _mm_set_epi64x. Per the thread, this avoids the reported MSVC
  // miscompile and also yields better generated code.
  // _mm_set_epi32 takes its elements highest first, so the two halves of `r`
  // land in elements 1 and 0 and elements 3 and 2 are zero.
  return _mm_set_epi32(0, 0, static_cast<int>(r >> 32), static_cast<int>(r));
#else
  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
#endif
}

With the change ms compiler actually generates much better code (compare it to the result that comes with _mm_set_epi64x).

However, it was still not enough. It looks like the crc code has too many issues. After I applied the fix above, tests still failed. When I tried to debug, it appeared that the CPU detection code is... not sure how to say it, but it's very questionable. It tests for some old CPUs to enable the optimized code, but even my CPU, which is a couple of generations old, gets skipped and is reported as CpuType::kUnknown because GetIntelCpuType thinks that kIntelSkylake is the latest. I updated the code to return CpuType::kIntelSkylake; this fixed some tests, but other tests still failed. Maybe something else gets miscompiled; I don't know, I didn't try to check.

pps83 force-pushed the CRC32_u64-x86 branch from f265dff to 72c0978 Compare April 18, 2024 01:41

pps83 mentioned this pull request Apr 18, 2024

Remove unnecessary limitation for clang-cl or icx #1655

Closed

derekmauro marked this pull request as draft April 18, 2024 20:11

pps83 force-pushed the CRC32_u64-x86 branch from 72c0978 to f3717f8 Compare April 18, 2024 20:12

This comment was marked as outdated.

Sign in to view

pps83 changed the title ~~Implement CRC32_u64 and V128_Low64 for x86 builds~~ Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds Apr 18, 2024

pps83 force-pushed the CRC32_u64-x86 branch from f3717f8 to b6b9c4c Compare April 21, 2024 22:25

derekmauro added the kokoro:run label Apr 23, 2024

kokoro-team removed the kokoro:run label Apr 23, 2024

derekmauro marked this pull request as ready for review May 8, 2024 16:56

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds #1654

Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds #1654

pps83 commented Apr 18, 2024 •

edited

Loading

pps83 commented Apr 18, 2024

derekmauro commented Apr 18, 2024

This comment was marked as outdated.

pps83 commented Apr 18, 2024 •

edited

Loading

pps83 commented Apr 20, 2024

pps83 commented Apr 20, 2024

pps83 commented Apr 22, 2024

derekmauro commented Apr 22, 2024

pps83 commented Apr 24, 2024

pps83 commented Apr 24, 2024 •

edited

Loading

pps83 commented Apr 24, 2024

derekmauro commented May 8, 2024

pps83 commented May 11, 2024 •

edited

Loading

pps83 commented May 11, 2024 •

edited

Loading

Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds #1654

Are you sure you want to change the base?

Implement CRC32_u64, V128_Low64, and V128_Extract64 for x86 builds #1654

Conversation

pps83 commented Apr 18, 2024 • edited Loading

pps83 commented Apr 18, 2024

derekmauro commented Apr 18, 2024

This comment was marked as outdated.

pps83 commented Apr 18, 2024 • edited Loading

pps83 commented Apr 20, 2024

pps83 commented Apr 20, 2024

pps83 commented Apr 22, 2024

derekmauro commented Apr 22, 2024

pps83 commented Apr 24, 2024

pps83 commented Apr 24, 2024 • edited Loading

pps83 commented Apr 24, 2024

derekmauro commented May 8, 2024

pps83 commented May 11, 2024 • edited Loading

pps83 commented May 11, 2024 • edited Loading

pps83 commented Apr 18, 2024 •

edited

Loading

pps83 commented Apr 18, 2024 •

edited

Loading

pps83 commented Apr 24, 2024 •

edited

Loading

pps83 commented May 11, 2024 •

edited

Loading

pps83 commented May 11, 2024 •

edited

Loading