Optimizations for Armv8-A
These changes apply only to the AArch64 execution state. They also
add arm64 testing with Travis CI.
volyrique committed Sep 4, 2022
1 parent 066d2b1 commit eb11606
Showing 2 changed files with 143 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .travis.yml
@@ -1,4 +1,7 @@
 language: c
+arch:
+  - amd64
+  - arm64
 compiler:
   - gcc
   - clang
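Since arch is a build-matrix dimension in Travis CI, each compiler now builds on both architectures. For reference, the head of the resulting .travis.yml, as recoverable from the hunk above (the rest of the file is unchanged by this commit):

language: c
arch:
  - amd64
  - arm64
compiler:
  - gcc
  - clang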
142 changes: 140 additions & 2 deletions picohttpparser.c
@@ -34,6 +34,12 @@
 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"
 
 #if __GNUC__ >= 3
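These headers are guarded by the same feature-test macros that later select the implementation: SVE when the compiler targets it, otherwise NEON in the AArch64 execution state, otherwise the existing generic code. A minimal sketch of that dispatch pattern, assuming nothing beyond the macros above (simd_path is a hypothetical name, not from the patch):

#include <stdio.h>

/* Mirrors the #elif chain used by the new functions in this commit. */
static const char *simd_path(void)
{
#ifdef __ARM_FEATURE_SVE
    return "SVE";
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
    return "NEON (AArch64)";
#else
    return "generic";
#endif
}

int main(void)
{
    printf("selected path: %s\n", simd_path());
    return 0;
}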
@@ -71,9 +77,8 @@
 #define ADVANCE_TOKEN(tok, toklen) \
     do { \
         const char *tok_start = buf; \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
         int found2; \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
         if (!found2) { \
             CHECK_EOF(); \
         } \
@@ -131,6 +136,69 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }
 
+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#ifdef __ARM_FEATURE_SVE
+    *found = 0;
+
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
+        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));
+
+        if (svptest_any(pg, c)) {
+            *found = 1;
+            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+            break;
+        }
+    }
+
+    return buf;
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    *found = 0;
+
+    const size_t block_size = sizeof(uint8x16_t) - 1;
+    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += sizeof(uint8x16_t)) {
+        // This mask makes it possible to pack the comparison result into half a vector,
+        // which has the same size as uint64_t.
+        const uint16x8_t mask = vmovq_n_u16(0x0f00);
+        uint8x16_t v = vld1q_u8((const uint8_t *) buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+        v = vreinterpretq_u8_u16(vbicq_u16(vreinterpretq_u16_u8(v), mask));
+        // Pack the comparison result into 64 bits.
+        v = vpmaxq_u8(v, v);
+
+        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__ ("rbit %x0, %x0" : "+r" (offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
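For reference, a scalar routine with the same contract as the new findchar_nonprintable_fast: report the first byte at or below space ('\040') or equal to DEL ('\177'). Note that the NEON path only scans whole 16-byte blocks, so it may return with *found == 0 before reaching buf_end, in which case ADVANCE_TOKEN falls back to byte-at-a-time scanning. A sketch, not part of the patch:

static const char *findchar_nonprintable_scalar(const char *buf, const char *buf_end, int *found)
{
    *found = 0;
    for (; buf != buf_end; ++buf) {
        const unsigned char c = (unsigned char) *buf;
        if (c <= '\040' || c == '\177') {
            *found = 1;
            break;
        }
    }
    return buf;
}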
@@ -143,6 +211,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_FEATURE_SVE)
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
+        const uint8_t space = '\040';
+
+        if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
+            svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
+
+            c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
+
+            if (svptest_any(pg, c)) {
+                buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+                goto FOUND_CTL;
+            }
+        }
+    }
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
+    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits.
+        v3 = vpmaxq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__ ("rbit %x0, %x0" : "+r" (offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+    }
+#else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {
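Both new get_token_to_eol paths start with a coarse filter: subtracting '\040' with unsigned wraparound maps the printable range [0x20, 0x7E] onto [0, 0x5E], so a single unsigned comparison against 0137 (0x5F) flags every byte outside it. Bytes above 0x7F also trip the filter, but the exact test that follows (a CTL other than TAB, or DEL) rejects them, so non-ASCII header content still passes. A standalone check of that identity, not from the patch:

#include <stdio.h>

int main(void)
{
    for (unsigned c = 0; c < 256; ++c) {
        /* Coarse filter from the patch: one unsigned comparison. */
        const int coarse = (unsigned char) (c - 0x20) >= 0137;
        /* Exact condition: CTL other than TAB, or DEL. */
        const int exact = (c < 0x20 && c != '\t') || c == 0x7f;
        if (exact && !coarse) {
            puts("coarse filter missed a control byte"); /* never happens */
            return 1;
        }
    }
    puts("coarse range check covers every control byte");
    return 0;
}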
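Both ARM paths locate the first matching byte by packing the per-byte comparison masks into a 64-bit word (4 bits per input byte in findchar_nonprintable_fast, 2 bits per byte here, where 32 bytes are processed per iteration), then reversing the bits with rbit so that counting leading zeros yields the position of the lowest set bit; dividing by the per-byte stride gives the byte offset. A sketch of the arithmetic using __builtin_ctzll, the portable equivalent of the patch's rbit plus __builtin_clzll pair:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Suppose bytes 5 and 9 of a 32-byte block matched; with 2 bits per
       input byte, the packed 64-bit mask has bits set at positions 10 and 18. */
    const uint64_t packed = (3ull << (5 * 2)) | (3ull << (9 * 2));
    /* Counting trailing zeros finds the lowest set bit; divide by the
       2-bit stride to recover the byte index. Prints 5. */
    printf("first match at byte %d\n", __builtin_ctzll(packed) / 2);
    return 0;
}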
