@@ -549,7 +549,61 @@ rocksdb::Status Bitmap::BitOp(BitOpFlags op_flag, const std::string &op_name, co
} else {
  memset(frag_res.get(), 0, frag_maxlen);
}
+/* Fast path: as long as we have data for all the input bitmaps, we can
+ * take a fast path that performs much better than the vanilla algorithm.
+ * The 64-bit words are loaded and stored via memcpy, and we rely on the
+ * compiler to generate efficient code for it, so the fast path does not
+ * have to be disabled on ARM machines.
+ */
+if (frag_minlen >= sizeof(uint64_t) * 4 && frag_numkeys <= 16) {
+  uint8_t *lres = frag_res.get();
+  const uint8_t *lp[16];
+  for (uint64_t i = 0; i < frag_numkeys; i++) {
+    lp[i] = reinterpret_cast<const uint8_t *>(fragments[i].data());
+  }
+  memcpy(frag_res.get(), fragments[0].data(), frag_minlen);
+  auto apply_fast_path_op = [&](auto op) {
+    // Note: kBitOpNot cannot use this helper, it only applies to
+    // kBitOpAnd, kBitOpOr and kBitOpXor.
+    DCHECK(op_flag != kBitOpNot);
+    while (frag_minlen >= sizeof(uint64_t) * 4) {
+      uint64_t lres_u64[4];
+      uint64_t lp_u64[4];
+      memcpy(lres_u64, lres, sizeof(lres_u64));
+      for (uint64_t i = 1; i < frag_numkeys; i++) {
+        memcpy(lp_u64, lp[i], sizeof(lp_u64));
+        op(lres_u64[0], lp_u64[0]);
+        op(lres_u64[1], lp_u64[1]);
+        op(lres_u64[2], lp_u64[2]);
+        op(lres_u64[3], lp_u64[3]);
+        lp[i] += sizeof(uint64_t) * 4;
+      }
+      // memcpy the accumulated words back to lres
+      memcpy(lres, lres_u64, sizeof(lres_u64));
+      lres += sizeof(uint64_t) * 4;
+      j += sizeof(uint64_t) * 4;
+      frag_minlen -= sizeof(uint64_t) * 4;
+    }
+  };

+  if (op_flag == kBitOpAnd) {
+    apply_fast_path_op([](uint64_t &a, const uint64_t &b) { a &= b; });
+  } else if (op_flag == kBitOpOr) {
+    apply_fast_path_op([](uint64_t &a, const uint64_t &b) { a |= b; });
+  } else if (op_flag == kBitOpXor) {
+    apply_fast_path_op([](uint64_t &a, const uint64_t &b) { a ^= b; });
+  } else if (op_flag == kBitOpNot) {
+    while (frag_minlen >= sizeof(uint64_t) * 4) {
+      uint64_t lres_u64[4];
+      memcpy(lres_u64, lres, sizeof(lres_u64));
+      for (uint64_t &w : lres_u64) w = ~w;
+      memcpy(lres, lres_u64, sizeof(lres_u64));
+      lres += sizeof(uint64_t) * 4;
+      j += sizeof(uint64_t) * 4;
+      frag_minlen -= sizeof(uint64_t) * 4;
+    }
+  }
+}
uint8_t output = 0, byte = 0;
for (; j < frag_maxlen; j++) {
  output = (fragments[0].size() <= j) ? 0 : fragments[0][j];
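For context, the core trick in this hunk is to process the bitmap 32 bytes at a time by round-tripping the data through local uint64_t words with memcpy, then falling back to the byte-by-byte loop for whatever is left. Below is a minimal standalone sketch of that pattern for the AND case; BitAndWordwise and the buffers are hypothetical illustration code, not part of Kvrocks.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical helper: AND `src` into `dst` in place, 4 x 64-bit words per
// iteration. The fixed-size memcpy keeps the word loads/stores well-defined
// even where unaligned access is a problem (e.g. on ARM); compilers typically
// lower it to plain register moves.
static void BitAndWordwise(uint8_t *dst, const uint8_t *src, size_t len) {
  size_t i = 0;
  while (len - i >= sizeof(uint64_t) * 4) {
    uint64_t d[4], s[4];
    std::memcpy(d, dst + i, sizeof(d));
    std::memcpy(s, src + i, sizeof(s));
    d[0] &= s[0];
    d[1] &= s[1];
    d[2] &= s[2];
    d[3] &= s[3];
    std::memcpy(dst + i, d, sizeof(d));
    i += sizeof(uint64_t) * 4;
  }
  // Scalar tail, equivalent to the vanilla byte loop the hunk falls back to.
  for (; i < len; i++) dst[i] &= src[i];
}

int main() {
  std::vector<uint8_t> a(100, 0xF0), b(100, 0x3C);
  BitAndWordwise(a.data(), b.data(), a.size());
  std::printf("%02x\n", a[0]);  // 0xF0 & 0x3C = 0x30
  return 0;
}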