Skip to content

Commit

Permalink
Turbo Base64: Encode/Decode avx512
Browse files Browse the repository at this point in the history
  • Loading branch information
powturbo authored Apr 4, 2023
1 parent 1ce41f0 commit c087948
Showing 1 changed file with 137 additions and 125 deletions.
262 changes: 137 additions & 125 deletions turbob64v512.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,150 @@

#define PREFETCH(_ip_,_i_,_rw_) __builtin_prefetch(_ip_+(_i_),_rw_)

//-------------------- Encode ----------------------------------------------------------------------
//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI
//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi
// ES512(_i_): software-pipelined AVX512_VBMI encode step: 192 input bytes -> 256
// base64 characters per invocation. It encodes the already-loaded u0/u1 (2 x 48
// input bytes each -> 2 x 64 output chars) while pre-loading the NEXT two 48-byte
// chunks into v0/v1, then reloads u0/u1 further ahead and encodes v0/v1 — the
// loads run ahead of the stores to hide memory latency.
// Per 64-byte vector: _mm512_permutexvar_epi8(vf, x) rearranges the bytes so each
// 32-bit lane holds one 3-byte source group; _mm512_multishift_epi64_epi8(vs, .)
// extracts the four 6-bit symbol indices (shift counts come from vs); the final
// _mm512_permutexvar_epi8(., vlut) maps each 6-bit index to its base64 ASCII char.
// NOTE(review): reads up to ip+96+_i_*192+207; the caller's loop bound must
// guarantee those bytes are readable (tb64v512enc's loop condition provides this).
#define ES512(_i_) { __m512i v0,v1;\
v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192) ),\
v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\
u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\
u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\
_mm512_storeu_si512((__m512i*)(op+_i_*256), u0);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\
\
u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\
u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\
v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\
v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\
}

// tb64v512enc: base64 encode using the AVX512_VBMI kernel ES512.
// in/inlen : source bytes; out : destination, must hold TB64ENCLEN(inlen) bytes.
// Returns the encoded length, TB64ENCLEN(inlen).
size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) {
const unsigned char *ip = in;
unsigned char *op = out;
size_t outlen = TB64ENCLEN(inlen); // fix: was 'unsigned', which truncated the length for inlen >= ~3 GiB

// vlut: the 64-char base64 alphabet as a byte table for _mm512_permutexvar_epi8.
const __m512i vlut = _mm512_setr_epi64(0x4847464544434241ull, 0x504F4E4D4C4B4A49ull, // ABCDEF...789+/
0x5857565554535251ull, 0x6665646362615A59ull,
0x6E6D6C6B6A696867ull, 0x767574737271706Full,
0x333231307A797877ull, 0x2F2B393837363534ull),
// vf: byte permutation placing one 3-byte input group into each 32-bit lane.
vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e),
// vs: multishift shift counts extracting the four 6-bit symbols per lane.
vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10

#define EN 256
// Main loop: EN = 256 output chars (192 input bytes) per iteration, software
// pipelined — u0/u1 are pre-loaded here, ES512 keeps loading ahead.
if(outlen >= 128+256) {
__m512i u0 = _mm512_loadu_si512((__m512i *) ip );
__m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48));
for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) {
ES512(0);
#if EN > 256
ES512(1);
#endif
PREFETCH(ip, 384, 0);
}
#if EN > 256 // dead with EN == 256; NOTE(review): 'ES256' below looks like a typo for ES512 — confirm before enabling
if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; }
#endif
}

// AVX2 cleanup loop: 24 input bytes -> 32 output chars per iteration.
// vh duplicates/reorders bytes for the project helpers bitunpack256v8_6 /
// bitmap256v8_6 (defined elsewhere).
const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
for(; op < out+outlen-32; op += 32, ip += 32*3/4) {
__m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) );
v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1);
v = _mm256_shuffle_epi8(v, vh);
v = bitunpack256v8_6(v);
v = bitmap256v8_6(v);
_mm256_storeu_si256((__m256i*) op, v);
}
EXTAIL(); // project macro: presumably emits the final chars + '=' padding — defined elsewhere
return outlen;
}

//--------------------- Decode ----------------------------------------------------------------------
// CHECK0(a): always executes the validity-check expression 'a'.
#define CHECK0(a) a
// CHECK1(a): executes 'a' only when compiled with -DB64CHECK (full validation);
// otherwise it expands to nothing, trading strict checking for speed.
#ifdef B64CHECK
#define CHECK1(a) a
#else
#define CHECK1(a)
#endif
#if 0 // Not faster than avx2
//----------------------------------------------------------
// BITMAP256V8_6: translate 64 base64 ASCII bytes to their 6-bit values via a
// 128-entry two-table byte lookup (vlut0/vlut1, defined in the decoder);
// invalid characters map to 0x80 so their sign bit can be tested (see B64CHK).
#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI

// BITPACK512V8_6: pack 64 bytes holding 6-bit values into 48 decoded bytes.
// maddubs merges adjacent 6-bit pairs (multipliers 0x40,0x01), madd merges the
// pairs into 24 bits per 32-bit lane, and the byte permute vp compacts the
// 3-byte groups into the low 48 bytes of v (vp's last four words are zero, so
// the top 16 bytes are not meaningful output).
#define BITPACK512V8_6(v) {\
__m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\
vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\
v = _mm512_permutexvar_epi8(vp, vm);\
}

// B64CHK: accumulate error bits into vx. Truth table 0xfe is the 3-input OR,
// i.e. vx |= ov | iv; only the sign bits matter at the end: set for raw input
// bytes >= 0x80 and for bytes the lookup marked invalid (0x80).
#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe)

// DS512(_i_): software-pipelined decode step: 256 base64 chars -> 192 output
// bytes per invocation. Decodes the already-loaded iu0/iu1 while pre-loading the
// next two 64-byte chunks into iv0/iv1, then reloads iu0/iu1 further ahead and
// decodes iv0/iv1. Each 64-byte store carries only 48 valid bytes, so successive
// stores overlap by 16 bytes (op advances by 48); the caller's loop bound must
// leave that slack at the end of the output buffer. Only the first check per
// step is unconditional (CHECK0); the others run only under -DB64CHECK.
#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1; \
iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)), \
iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\
\
BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\
BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\
\
iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\
iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\
\
_mm512_storeu_si512((__m128i*)(op+_i_*192), ou0);\
_mm512_storeu_si512((__m128i*)(op+_i_*192+48), ou1);\
\
BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\
BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\
\
_mm512_storeu_si512((__m128i*)(op+_i_*192+ 96), ov0);\
_mm512_storeu_si512((__m128i*)(op+_i_*192+144), ov1);\
}

//-----------------------------------------------
// tb64v512dec: base64 decode using the AVX512_VBMI kernels above.
// in/inlen : base64 input; out : destination buffer.
// Returns the number of decoded bytes, or 0 when the input is not valid base64.
size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) {
const unsigned char *ip = in;
unsigned char *op = out;
#define DN 512
__m512i vx = _mm512_setzero_si512(); // error accumulator: sign bits mark invalid bytes (see B64CHK)
if(inlen > 56+128) {
// vlut0/vlut1: 128-entry two-table map from base64 ASCII codes to 6-bit
// values; 0x80 entries mark characters outside the alphabet.
const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080),
vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
0x1211100f, 0x16151413, 0x80191817, 0x80808080,
0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080),
// vp: byte permutation compacting decoded 3-byte groups (BITPACK512V8_6).
vp = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
0x00000000, 0x00000000, 0x00000000, 0x00000000);

// Pre-load the first 128 chars for the software-pipelined main loop.
__m512i iu0 = _mm512_loadu_si512((__m512i *) ip),
iu1 = _mm512_loadu_si512((__m512i *)(ip+64));
for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0);
DS512(0);
#if DN > 256
DS512(1);
#endif
}
// 64-chars-at-a-time cleanup; the extra -4 leaves the (possibly '='-padded)
// tail for the scalar decoder below.
for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) {
__m512i iv = _mm512_loadu_si512((__m512i *) ip), ov;
BITMAP256V8_6(iv, ov);
CHECK0(B64CHK(iv, ov, vx));
BITPACK512V8_6(ov);
_mm512_storeu_si512((__m128i*) op, ov);
}
}
// Scalar/generic tail via tb64xdec (defined elsewhere).
// fix: rc must be initialized — when r == 0 (e.g. inlen == 0) tb64xdec is not
// called and rc was read uninitialized in the return expression (UB).
unsigned rc = 0, r = inlen-(ip-in);
if(r && !(rc=tb64xdec(ip, r, op)) || _mm512_movepi8_mask(vx)) return 0;
return (op-out)+rc;
}

#if 0 // AVX512F but Not faster than avx2
#define BITPACK512V8_6_(v) {\
const __m512i merge_ab_and_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140));\
v = _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));\
Expand Down Expand Up @@ -124,127 +260,3 @@ size_t tb64v512dec0(const unsigned char *in, size_t inlen, unsigned char *out) {
return (op-out)+rc;
}
#endif
//----------------------------------------------------------
// BITMAP256V8_6: translate 64 base64 ASCII bytes to their 6-bit values via a
// 128-entry two-table byte lookup (vlut0/vlut1, defined in the decoder);
// invalid characters map to 0x80 so their sign bit can be tested (see B64CHK).
#define BITMAP256V8_6(iv, ov) ov = _mm512_permutex2var_epi8(vlut0, iv, vlut1); //AVX-512_VBMI

// BITPACK512V8_6: pack 64 bytes holding 6-bit values into 48 decoded bytes.
// maddubs merges adjacent 6-bit pairs (multipliers 0x40,0x01), madd merges the
// pairs into 24 bits per 32-bit lane, and the byte permute vp compacts the
// 3-byte groups into the low 48 bytes of v (vp's last four words are zero, so
// the top 16 bytes are not meaningful output).
#define BITPACK512V8_6(v) {\
__m512i merge_ab_bc = _mm512_maddubs_epi16(v, _mm512_set1_epi32(0x01400140)),\
vm = _mm512_madd_epi16(merge_ab_bc, _mm512_set1_epi32(0x00011000));\
v = _mm512_permutexvar_epi8(vp, vm);\
}

// B64CHK: accumulate error bits into vx. Truth table 0xfe is the 3-input OR,
// i.e. vx |= ov | iv; only the sign bits matter at the end: set for raw input
// bytes >= 0x80 and for bytes the lookup marked invalid (0x80).
#define B64CHK(iv, ov, vx) vx = _mm512_ternarylogic_epi32(vx, ov, iv, 0xfe)

// DS512(_i_): software-pipelined decode step: 256 base64 chars -> 192 output
// bytes per invocation. Decodes the already-loaded iu0/iu1 while pre-loading the
// next two 64-byte chunks into iv0/iv1, then reloads iu0/iu1 further ahead and
// decodes iv0/iv1. Each 64-byte store carries only 48 valid bytes, so successive
// stores overlap by 16 bytes (op advances by 48); the caller's loop bound must
// leave that slack at the end of the output buffer. Only the first check per
// step is unconditional (CHECK0); the others run only under -DB64CHECK.
#define DS512(_i_) { __m512i iv0,iv1,ou0,ou1,ov0,ov1;\
iv0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256)), \
iv1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+64));\
BITMAP256V8_6(iu0, ou0); CHECK0(B64CHK(iu0, ou0, vx)); BITPACK512V8_6(ou0);\
BITMAP256V8_6(iu1, ou1); CHECK1(B64CHK(iu1, ou1, vx)); BITPACK512V8_6(ou1);\
iu0 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+128)),\
iu1 = _mm512_loadu_si512((__m512i *)(ip+128+_i_*256+192));\
_mm512_storeu_si512((__m128i*)(op+_i_*192), ou0);\
_mm512_storeu_si512((__m128i*)(op+_i_*192+48), ou1);\
BITMAP256V8_6(iv0, ov0); CHECK1(B64CHK(iv0, ov0, vx)); BITPACK512V8_6(ov0);\
BITMAP256V8_6(iv1, ov1); CHECK1(B64CHK(iv1, ov1, vx)); BITPACK512V8_6(ov1);\
_mm512_storeu_si512((__m128i*)(op+_i_*192+ 96), ov0);\
_mm512_storeu_si512((__m128i*)(op+_i_*192+144), ov1);\
}

//-----------------------------------------------
// tb64v512dec: base64 decode using the AVX512_VBMI kernels above.
// in/inlen : base64 input; out : destination buffer.
// Returns the number of decoded bytes, or 0 when the input is not valid base64.
size_t tb64v512dec(const unsigned char *in, size_t inlen, unsigned char *out) {
const unsigned char *ip = in;
unsigned char *op = out;
#define DN 512
__m512i vx = _mm512_setzero_si512(); // error accumulator: sign bits mark invalid bytes (see B64CHK)
if(inlen > 56+128) {
// vlut0/vlut1: 128-entry two-table map from base64 ASCII codes to 6-bit
// values; 0x80 entries mark characters outside the alphabet.
const __m512i vlut0 = _mm512_setr_epi32(0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x80808080, 0x80808080,
0x80808080, 0x80808080, 0x3e808080, 0x3f808080,
0x37363534, 0x3b3a3938, 0x80803d3c, 0x80808080),
vlut1 = _mm512_setr_epi32(0x02010080, 0x06050403, 0x0a090807, 0x0e0d0c0b,
0x1211100f, 0x16151413, 0x80191817, 0x80808080,
0x1c1b1a80, 0x201f1e1d, 0x24232221, 0x28272625,
0x2c2b2a29, 0x302f2e2d, 0x80333231, 0x80808080),
// vp: byte permutation compacting decoded 3-byte groups (BITPACK512V8_6).
vp = _mm512_setr_epi32(0x06000102, 0x090a0405, 0x0c0d0e08, 0x16101112,
0x191a1415, 0x1c1d1e18, 0x26202122, 0x292a2425,
0x2c2d2e28, 0x36303132, 0x393a3435, 0x3c3d3e38,
0x00000000, 0x00000000, 0x00000000, 0x00000000);

// Pre-load the first 128 chars for the software-pipelined main loop.
__m512i iu0 = _mm512_loadu_si512((__m512i *) ip),
iu1 = _mm512_loadu_si512((__m512i *)(ip+64));
for( ; ip < in+(inlen-(DN+4)); ip += DN, op += (DN/4)*3) { PREFETCH(ip,384,0);
DS512(0);
#if DN > 256
DS512(1);
#endif
}
// 64-chars-at-a-time cleanup; the extra -4 leaves the (possibly '='-padded)
// tail for the scalar decoder below.
for(; ip < (in+inlen)-64-4; ip += 64, op += 64*3/4) {
__m512i iv = _mm512_loadu_si512((__m512i *) ip), ov;
BITMAP256V8_6(iv, ov); CHECK0(B64CHK(iv, ov, vx)); BITPACK512V8_6(ov);
_mm512_storeu_si512((__m128i*) op, ov);
}
}
// Scalar/generic tail via tb64xdec (defined elsewhere).
// fix: rc must be initialized — when r == 0 (e.g. inlen == 0) tb64xdec is not
// called and rc was read uninitialized in the return expression (UB).
unsigned rc = 0, r = inlen-(ip-in);
if(r && !(rc=tb64xdec(ip, r, op)) || _mm512_movepi8_mask(vx)) return 0;
return (op-out)+rc;
}

//-------------------- Encode ----------------------------------------------------------------------
//AVX512_VBMI: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=1276,5146,5146,5146&text=_mm512_multishift_epi64_epi8&avx512techs=AVX512_VBMI
//reference: http://0x80.pl/notesen/2016-04-03-avx512-base64.html#avx512vbmi
// ES512(_i_): software-pipelined AVX512_VBMI encode step: 192 input bytes -> 256
// base64 characters per invocation. It encodes the already-loaded u0/u1 (2 x 48
// input bytes each -> 2 x 64 output chars) while pre-loading the NEXT two 48-byte
// chunks into v0/v1, then reloads u0/u1 further ahead and encodes v0/v1 — the
// loads run ahead of the stores to hide memory latency.
// Per 64-byte vector: _mm512_permutexvar_epi8(vf, x) rearranges the bytes so each
// 32-bit lane holds one 3-byte source group; _mm512_multishift_epi64_epi8(vs, .)
// extracts the four 6-bit symbol indices (shift counts come from vs); the final
// _mm512_permutexvar_epi8(., vlut) maps each 6-bit index to its base64 ASCII char.
// NOTE(review): reads up to ip+96+_i_*192+207; the caller's loop bound must
// guarantee those bytes are readable (tb64v512enc's loop condition provides this).
#define ES512(_i_) { __m512i v0,v1;\
v0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192) ),\
v1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+48));\
u0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u0)), vlut);\
u1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, u1)), vlut);\
_mm512_storeu_si512((__m512i*)(op+_i_*256), u0);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+64), u1);\
\
u0 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+ 96));\
u1 = _mm512_loadu_si512((__m512i *)(ip+96+_i_*192+144));\
v0 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v0)), vlut);\
v1 = _mm512_permutexvar_epi8(_mm512_multishift_epi64_epi8(vs, _mm512_permutexvar_epi8(vf, v1)), vlut);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+128), v0);\
_mm512_storeu_si512((__m512i*)(op+_i_*256+192), v1);\
}

// tb64v512enc: base64 encode using the AVX512_VBMI kernel ES512.
// in/inlen : source bytes; out : destination, must hold TB64ENCLEN(inlen) bytes.
// Returns the encoded length, TB64ENCLEN(inlen).
size_t tb64v512enc(const unsigned char* in, size_t inlen, unsigned char *out) {
const unsigned char *ip = in;
unsigned char *op = out;
size_t outlen = TB64ENCLEN(inlen); // fix: was 'unsigned', which truncated the length for inlen >= ~3 GiB

// vlut: the 64-char base64 alphabet, loaded as one 64-byte vector for
// _mm512_permutexvar_epi8 (the string is exactly 64 characters long).
static const char *lut = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const __m512i vlut = _mm512_loadu_si512((const __m512i*)lut);
// vf: byte permutation placing one 3-byte input group into each 32-bit lane.
const __m512i vf = _mm512_setr_epi32(0x01020001, 0x04050304, 0x07080607, 0x0a0b090a,
0x0d0e0c0d, 0x10110f10, 0x13141213, 0x16171516,
0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
// vs: multishift shift counts extracting the four 6-bit symbols per lane.
const __m512i vs = _mm512_set1_epi64(0x3036242a1016040alu); // 48, 54, 36, 42, 16, 22, 4, 10

#define EN 256
// Main loop: EN = 256 output chars (192 input bytes) per iteration, software
// pipelined — u0/u1 are pre-loaded here, ES512 keeps loading ahead.
if(outlen >= 128+256) {
__m512i u0 = _mm512_loadu_si512((__m512i *) ip );
__m512i u1 = _mm512_loadu_si512((__m512i *)(ip+48));
for(; op < (out+outlen)-(128+EN); op += EN, ip += EN*3/4) {
ES512(0);
#if EN > 256
ES512(1);
#endif
PREFETCH(ip, 384, 0);
}
#if EN > 256 // dead with EN == 256; NOTE(review): 'ES256' below looks like a typo for ES512 — confirm before enabling
if(op < (out+outlen)-(128+256)) { ES256(0); op += 256; ip += 256*3/4; }
#endif
}

// AVX2 cleanup loop: 24 input bytes -> 32 output chars per iteration.
// vh duplicates/reorders bytes for the project helpers bitunpack256v8_6 /
// bitmap256v8_6 (defined elsewhere).
const __m256i vh = _mm256_set_epi8(10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
10,11, 9,10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
for(; op < out+outlen-32; op += 32, ip += 32*3/4) {
__m256i v = _mm256_castsi128_si256( _mm_loadu_si128((__m128i *) ip ) );
v = _mm256_inserti128_si256(v,_mm_loadu_si128((__m128i *)(ip+12)),1);
v = _mm256_shuffle_epi8(v, vh);
v = bitunpack256v8_6(v);
v = bitmap256v8_6(v);
_mm256_storeu_si256((__m256i*) op, v);
}
EXTAIL(); // project macro: presumably emits the final chars + '=' padding — defined elsewhere
return outlen;
}

0 comments on commit c087948

Please sign in to comment.