From f33e48e5f9dfcdd821b6be28455c05ed5aebf217 Mon Sep 17 00:00:00 2001 From: Mryange <59914473+Mryange@users.noreply.github.com> Date: Mon, 19 Aug 2024 15:50:54 +0800 Subject: [PATCH] [opt](arm) Optimize the BlockBloomFilter::bucket_find on ARM platforms using NEON instructions. (#38888) ## Proposed changes ``` -------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------- BM_BucketFindNeon 8.14 ns 8.14 ns 344002441 BM_BucketFindNative 17.5 ns 17.5 ns 160152491 ``` --- be/src/exprs/block_bloom_filter.hpp | 36 ++++++++++++++++++++++++- be/src/exprs/block_bloom_filter_impl.cc | 29 +++++++++++++++++--- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp index f31d7f7d4c0517..b7d488a3003c3d 100644 --- a/be/src/exprs/block_bloom_filter.hpp +++ b/be/src/exprs/block_bloom_filter.hpp @@ -124,6 +124,39 @@ class BlockBloomFilter { return false; } +#ifdef __ARM_NEON + void make_find_mask(uint32_t key, uint32x4_t* masks) const noexcept { + uint32x4_t hash_data_1 = vdupq_n_u32(key); + uint32x4_t hash_data_2 = vdupq_n_u32(key); + + uint32x4_t rehash_1 = vld1q_u32(&kRehash[0]); + uint32x4_t rehash_2 = vld1q_u32(&kRehash[4]); + + // masks[i] = key * kRehash[i]; + hash_data_1 = vmulq_u32(rehash_1, hash_data_1); + hash_data_2 = vmulq_u32(rehash_2, hash_data_2); + // masks[i] = masks[i] >> shift_num; + hash_data_1 = vshrq_n_u32(hash_data_1, shift_num); + hash_data_2 = vshrq_n_u32(hash_data_2, shift_num); + + const uint32x4_t ones = vdupq_n_u32(1); + + // masks[i] = 0x1 << masks[i]; + masks[0] = vshlq_u32(ones, reinterpret_cast(hash_data_1)); + masks[1] = vshlq_u32(ones, reinterpret_cast(hash_data_2)); + } +#else + void make_find_mask(uint32_t key, uint32_t* masks) const noexcept { + for (int i = 0; i < kBucketWords; ++i) { + masks[i] = key * kRehash[i]; + + masks[i] = masks[i] >> shift_num; + + masks[i] = 0x1 << masks[i]; + } + } +#endif + // Computes the logical OR of this filter with 'other' and stores the result in this // filter. // Notes: @@ -163,7 +196,8 @@ class BlockBloomFilter { // log2(number of bits in a BucketWord) static constexpr int kLogBucketWordBits = 5; static constexpr BucketWord kBucketWordMask = (1 << kLogBucketWordBits) - 1; - + // (>> 27) is equivalent to (mod 32) + static constexpr auto shift_num = ((1 << kLogBucketWordBits) - kLogBucketWordBits); // log2(number of bytes in a bucket) static constexpr int kLogBucketByteSize = 5; // Bucket size in bytes. diff --git a/be/src/exprs/block_bloom_filter_impl.cc b/be/src/exprs/block_bloom_filter_impl.cc index d285edcb310e1b..e89b91422662a9 100644 --- a/be/src/exprs/block_bloom_filter_impl.cc +++ b/be/src/exprs/block_bloom_filter_impl.cc @@ -138,14 +138,37 @@ void BlockBloomFilter::bucket_insert(const uint32_t bucket_idx, const uint32_t h } bool BlockBloomFilter::bucket_find(const uint32_t bucket_idx, const uint32_t hash) const noexcept { +#if defined(__ARM_NEON) + uint32x4_t masks[2]; + + uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]); + uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]); + + make_find_mask(hash, masks); + // The condition for returning true is that all the bits in _directory[bucket_idx][i] specified by masks[i] are 1. + // This can be equivalently expressed as all the bits in not( _directory[bucket_idx][i]) specified by masks[i] are 0. + // vbicq_u32(vec1, vec2) : Result of (vec1 AND NOT vec2) + // If true is returned, out_1 and out_2 should be all zeros. + uint32x4_t out_1 = vbicq_u32(masks[0], directory_1); + uint32x4_t out_2 = vbicq_u32(masks[1], directory_2); + + out_1 = vorrq_u32(out_1, out_2); + + uint32x2_t low = vget_low_u32(out_1); + uint32x2_t high = vget_high_u32(out_1); + low = vorr_u32(low, high); + uint32_t res = vget_lane_u32(low, 0) | vget_lane_u32(low, 1); + return !(res); +#else + uint32_t masks[kBucketWords]; + make_find_mask(hash, masks); for (int i = 0; i < kBucketWords; ++i) { - BucketWord hval = (kRehash[i] * hash) >> ((1 << kLogBucketWordBits) - kLogBucketWordBits); - hval = 1U << hval; - if (!(DCHECK_NOTNULL(_directory)[bucket_idx][i] & hval)) { + if ((DCHECK_NOTNULL(_directory)[bucket_idx][i] & masks[i]) == 0) { return false; } } return true; +#endif } void BlockBloomFilter::insert_no_avx2(const uint32_t hash) noexcept {