diff --git a/CMakeLists.txt b/CMakeLists.txt index f236231d..aae312d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,12 +48,13 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_compile_options(/W2 /bigobj /Zf) add_compile_definitions(NOMINMAX _USE_MATH_DEFINES WIN32_LEAN_AND_MEAN) else() - add_compile_options(-Wall -Wextra) + add_compile_options(-Wall -Wextra -march=native) endif() include(CTest) else() set(OUSTER_TOP_LEVEL OFF) + add_compile_options(-Wall -Wextra -march=native) endif() # === Subdirectories === diff --git a/ouster_osf/CMakeLists.txt b/ouster_osf/CMakeLists.txt index 9366ef30..0078845c 100644 --- a/ouster_osf/CMakeLists.txt +++ b/ouster_osf/CMakeLists.txt @@ -93,6 +93,7 @@ add_library(ouster_osf STATIC src/compat_ops.cpp src/fb_utils.cpp src/writer.cpp src/async_writer.cpp + src/zpng.cpp src/png_lidarscan_encoder.cpp ) set_property(TARGET ouster_osf PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/ouster_osf/src/fpnge.cc b/ouster_osf/src/fpnge.cc new file mode 100644 index 00000000..c8e3be0b --- /dev/null +++ b/ouster_osf/src/fpnge.cc @@ -0,0 +1,1714 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fpnge.h" +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && !defined(__clang__) +#define FORCE_INLINE_LAMBDA +#define FORCE_INLINE __forceinline +#define __SSE4_1__ 1 +#define __PCLMUL__ 1 +#ifdef __AVX2__ +#define __BMI2__ 1 +#endif +#else +#define FORCE_INLINE_LAMBDA __attribute__((always_inline)) +#define FORCE_INLINE __attribute__((always_inline)) inline +#endif + +#if defined(__x86_64__) || defined(__amd64__) || defined(__LP64) || \ + defined(_M_X64) || defined(_M_AMD64) || \ + (defined(_WIN64) && !defined(_M_ARM64)) +#define PLATFORM_AMD64 1 +#endif + +#if !defined(FPNGE_USE_PEXT) +#if defined(__BMI2__) && defined(PLATFORM_AMD64) && \ + !defined(__tune_znver1__) && !defined(__tune_znver2__) && \ + !defined(__tune_bdver4__) +#define FPNGE_USE_PEXT 1 +#else +#define FPNGE_USE_PEXT 0 +#endif +#endif + +#ifdef __AVX2__ +#include +#define MM(f) _mm256_##f +#define MMSI(f) _mm256_##f##_si256 +#define MIVEC __m256i +#define BCAST128 _mm256_broadcastsi128_si256 +// workaround for compilers not supporting _mm256_zextsi128_si256 +#if (defined(__clang__) && __clang_major__ >= 5 && \ + (!defined(__APPLE__) || __clang_major__ >= 7)) || \ + (defined(__GNUC__) && __GNUC__ >= 10) || \ + (defined(_MSC_VER) && _MSC_VER >= 1910) +#define INT2VEC(v) _mm256_zextsi128_si256(_mm_cvtsi32_si128(v)) +#elif defined(__OPTIMIZE__) +// technically incorrect, but should work fine most of the time +#define INT2VEC(v) _mm256_castsi128_si256(_mm_cvtsi32_si128(v)) +#else +// _mm256_insert_epi32 is unavailable on MSVC 19.0, so prefer the following +#define INT2VEC(v) \ + _mm256_inserti128_si256(_mm256_setzero_si256(), _mm_cvtsi32_si128(v), 0); +#endif +#define SIMD_WIDTH 32 +#define SIMD_MASK 0xffffffffU +#elif defined(__SSE4_1__) +#include +#define MM(f) _mm_##f +#define MMSI(f) _mm_##f##_si128 +#define MIVEC __m128i +#define BCAST128(v) (v) +#define INT2VEC _mm_cvtsi32_si128 +#define SIMD_WIDTH 16 +#define SIMD_MASK 0xffffU +#else 
+#error Requires SSE4.1 support minium +#endif + +namespace { + +alignas(16) constexpr uint8_t kBitReverseNibbleLookup[16] = { + 0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110, + 0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111, +}; + +static constexpr uint8_t kLZ77NBits[29] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 0}; + +static constexpr uint16_t kLZ77Base[29] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, + 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258}; + +static uint16_t BitReverse(size_t nbits, uint16_t bits) { + uint16_t rev16 = (kBitReverseNibbleLookup[bits & 0xF] << 12) | + (kBitReverseNibbleLookup[(bits >> 4) & 0xF] << 8) | + (kBitReverseNibbleLookup[(bits >> 8) & 0xF] << 4) | + (kBitReverseNibbleLookup[bits >> 12]); + return rev16 >> (16 - nbits); +} + +struct HuffmanTable { + uint8_t nbits[286]; + uint16_t end_bits; + + alignas(16) uint8_t approx_nbits[16]; + + alignas(16) uint8_t first16_nbits[16]; + alignas(16) uint8_t first16_bits[16]; + + alignas(16) uint8_t last16_nbits[16]; + alignas(16) uint8_t last16_bits[16]; + + alignas(16) uint8_t mid_lowbits[16]; + uint8_t mid_nbits; + + uint32_t lz77_length_nbits[259] = {}; + uint32_t lz77_length_bits[259] = {}; + uint32_t lz77_length_sym[259] = {}; + + uint32_t dist_nbits, dist_bits; + + // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <= + // max_limit[i], so to minimize sum(nbits[i] * freqs[i]). 
+ static void ComputeCodeLengths(const uint64_t *freqs, size_t n, + uint8_t *min_limit, uint8_t *max_limit, + uint8_t *nbits) { + size_t precision = 0; + uint64_t freqsum = 0; + for (size_t i = 0; i < n; i++) { + assert(freqs[i] != 0); + freqsum += freqs[i]; + if (min_limit[i] < 1) + min_limit[i] = 1; + assert(min_limit[i] <= max_limit[i]); + precision = std::max(max_limit[i], precision); + } + uint64_t infty = freqsum * precision; + std::vector dynp(((1U << precision) + 1) * (n + 1), infty); + auto d = [&](size_t sym, size_t off) -> uint64_t & { + return dynp[sym * ((1 << precision) + 1) + off]; + }; + d(0, 0) = 0; + for (size_t sym = 0; sym < n; sym++) { + for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + for (size_t off = 0; off + off_delta <= (1U << precision); off++) { + d(sym + 1, off + off_delta) = std::min( + d(sym, off) + freqs[sym] * bits, d(sym + 1, off + off_delta)); + } + } + } + + size_t sym = n; + size_t off = 1U << precision; + + while (sym-- > 0) { + assert(off > 0); + for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + if (off_delta <= off && + d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) { + off -= off_delta; + nbits[sym] = bits; + break; + } + } + } + } + + void ComputeNBits(const uint64_t *collected_data) { + constexpr uint64_t kBaselineData[286] = { + 113, 54, 28, 18, 12, 9, 7, 6, 5, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6, 7, 9, + 12, 18, 29, 54, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }; + + uint64_t data[286]; + + for (size_t i = 0; i < 286; i++) { + data[i] = collected_data[i] + kBaselineData[i]; + } + + // Compute Huffman code length ensuring that all the "fake" symbols for [16, + // 240) and [255, 285) have their maximum length. + uint64_t collapsed_data[16 + 14 + 16 + 2] = {}; + uint8_t collapsed_min_limit[16 + 14 + 16 + 2] = {}; + uint8_t collapsed_max_limit[16 + 14 + 16 + 2]; + for (size_t i = 0; i < 48; i++) { + collapsed_max_limit[i] = 8; + } + for (size_t i = 0; i < 16; i++) { + collapsed_data[i] = data[i]; + } + for (size_t i = 0; i < 14; i++) { + collapsed_data[16 + i] = 1; + collapsed_min_limit[16 + i] = 8; + } + for (size_t j = 0; j < 16; j++) { + collapsed_data[16 + 14 + j] += data[240 + j]; + } + collapsed_data[16 + 14 + 16] = 1; + collapsed_min_limit[16 + 14 + 16] = 8; + collapsed_data[16 + 14 + 16 + 1] = data[285]; + + uint8_t collapsed_nbits[48] = {}; + ComputeCodeLengths(collapsed_data, 48, collapsed_min_limit, + collapsed_max_limit, collapsed_nbits); + + // Compute "extra" code lengths for symbols >= 256, except 285. 
+ uint8_t tail_nbits[29] = {}; + uint8_t tail_min_limit[29] = {}; + uint8_t tail_max_limit[29] = {}; + for (size_t i = 0; i < 29; i++) { + tail_min_limit[i] = 4; + tail_max_limit[i] = 7; + } + ComputeCodeLengths(data + 256, 29, tail_min_limit, tail_max_limit, + tail_nbits); + + for (size_t i = 0; i < 16; i++) { + nbits[i] = collapsed_nbits[i]; + } + for (size_t i = 0; i < 14; i++) { + for (size_t j = 0; j < 16; j++) { + nbits[(i + 1) * 16 + j] = collapsed_nbits[16 + i] + 4; + } + } + for (size_t i = 0; i < 16; i++) { + nbits[240 + i] = collapsed_nbits[30 + i]; + } + for (size_t i = 0; i < 29; i++) { + nbits[256 + i] = collapsed_nbits[46] + tail_nbits[i]; + } + nbits[285] = collapsed_nbits[47]; + } + + void ComputeCanonicalCode(const uint8_t *nbits, uint16_t *bits) { + uint8_t code_length_counts[16] = {}; + for (size_t i = 0; i < 286; i++) { + code_length_counts[nbits[i]]++; + } + uint16_t next_code[16] = {}; + uint16_t code = 0; + for (size_t i = 1; i < 16; i++) { + code = (code + code_length_counts[i - 1]) << 1; + next_code[i] = code; + } + for (size_t i = 0; i < 286; i++) { + bits[i] = BitReverse(nbits[i], next_code[nbits[i]]++); + } + } + + void FillNBits() { + for (size_t i = 0; i < 16; i++) { + first16_nbits[i] = nbits[i]; + last16_nbits[i] = nbits[240 + i]; + } + mid_nbits = nbits[16]; + for (size_t i = 16; i < 240; i++) { + assert(nbits[i] == mid_nbits); + } + // Construct lz77 lookup tables. 
+ for (size_t i = 0; i < 29; i++) { + for (size_t j = 0; j < (1U << kLZ77NBits[i]); j++) { + lz77_length_nbits[kLZ77Base[i] + j] = nbits[257 + i] + kLZ77NBits[i]; + lz77_length_sym[kLZ77Base[i] + j] = 257 + i; + } + } + + dist_nbits = 1; + + approx_nbits[0] = + nbits[0] - 1; // subtract 1 as a fudge for catering for RLE + for (size_t i = 1; i < 15; i++) { + approx_nbits[i] = (nbits[i] + nbits[256 - i] + 1) / 2; + } + approx_nbits[15] = mid_nbits; + } + + void FillBits() { + uint16_t bits[286]; + ComputeCanonicalCode(nbits, bits); + for (size_t i = 0; i < 16; i++) { + first16_bits[i] = bits[i]; + last16_bits[i] = bits[240 + i]; + } + mid_lowbits[0] = mid_lowbits[15] = 0; + for (size_t i = 16; i < 240; i += 16) { + mid_lowbits[i / 16] = bits[i] & ((1U << (mid_nbits - 4)) - 1); + } + for (size_t i = 16; i < 240; i++) { + assert((uint32_t(mid_lowbits[i / 16]) | + (kBitReverseNibbleLookup[i % 16] << (mid_nbits - 4))) == bits[i]); + } + end_bits = bits[256]; + // Construct lz77 lookup tables. 
+ for (size_t i = 0; i < 29; i++) { + for (size_t j = 0; j < (1U << kLZ77NBits[i]); j++) { + lz77_length_bits[kLZ77Base[i] + j] = + bits[257 + i] | (j << nbits[257 + i]); + } + } + + dist_bits = 0; + } + + HuffmanTable(const uint64_t *collected_data) { + ComputeNBits(collected_data); + FillNBits(); + FillBits(); + } + + // estimate for CollectSymbolCounts + // only fills nbits; skips computing actual codes + HuffmanTable() { + // the following is similar to ComputeNBits(0, 0, 0 ...), but much faster + constexpr uint8_t collapsed_nbits[] = { + 2, 3, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 5, 5, 4, 3, + + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 8}; + for (size_t i = 0; i < 16; i++) { + nbits[i] = collapsed_nbits[i]; + nbits[240 + i] = collapsed_nbits[16 + i]; + } + for (size_t i = 16; i < 240; i++) { + nbits[i] = 12; + } + for (size_t i = 0; i < 30; i++) { + nbits[256 + i] = collapsed_nbits[32 + i]; + } + + FillNBits(); + } +}; + +struct BitWriter { + void Write(uint32_t count, uint64_t bits) { + buffer |= bits << bits_in_buffer; + bits_in_buffer += count; + memcpy(data + bytes_written, &buffer, 8); + size_t bytes_in_buffer = bits_in_buffer / 8; + bits_in_buffer &= 7; + buffer >>= bytes_in_buffer * 8; + bytes_written += bytes_in_buffer; + } + + void ZeroPadToByte() { + if (bits_in_buffer != 0) { + Write(8 - bits_in_buffer, 0); + } + } + + void WriteBytes(const char *data, size_t count) { + memcpy(this->data + bytes_written, data, count); + bytes_written += count; + } + + unsigned char *data; + size_t bytes_written = 0; + size_t bits_in_buffer = 0; + uint64_t buffer = 0; +}; + +static void WriteHuffmanCode(const HuffmanTable &table, + BitWriter *__restrict writer) { + constexpr uint8_t kCodeLengthNbits[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, + }; + constexpr uint8_t kCodeLengthOrder[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 
4, 12, 3, 13, 2, 14, 1, 15, + }; + writer->Write(5, 29); // all lit/len codes + writer->Write(5, 0); // distance code up to dist, included + writer->Write(4, 15); // all code length codes + for (size_t i = 0; i < 19; i++) { + writer->Write(3, kCodeLengthNbits[kCodeLengthOrder[i]]); + } + + for (size_t i = 0; i < 286; i++) { + writer->Write(4, kBitReverseNibbleLookup[table.nbits[i]]); + } + writer->Write(4, 0b1000); +} + +#ifdef __PCLMUL__ +} // namespace +#include +namespace { +alignas(32) static const uint8_t pshufb_shf_table[] = { + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + +class Crc32 { + __m128i x0, x1, x2, x3; + + static inline __m128i double_xor(__m128i a, __m128i b, __m128i c) { +#ifdef __AVX512VL__ + return _mm_ternarylogic_epi32(a, b, c, 0x96); +#else + return _mm_xor_si128(_mm_xor_si128(a, b), c); +#endif + } + static inline __m128i do_one_fold(__m128i src, __m128i data) { + const auto k1k2 = _mm_set_epi32(1, 0x54442bd4, 1, 0xc6e41596); + return double_xor(_mm_clmulepi64_si128(src, k1k2, 0x01), + _mm_clmulepi64_si128(src, k1k2, 0x10), data); + } + +public: + Crc32() { + x0 = _mm_cvtsi32_si128(0x9db42487); + x1 = _mm_setzero_si128(); + x2 = _mm_setzero_si128(); + x3 = _mm_setzero_si128(); + } + size_t update(const unsigned char *__restrict data, size_t len) { + auto amount = len & ~63; + for (size_t i = 0; i < amount; i += 64) { + x0 = do_one_fold(x0, _mm_loadu_si128((__m128i *)(data + i))); + x1 = do_one_fold(x1, _mm_loadu_si128((__m128i *)(data + i + 0x10))); + x2 = do_one_fold(x2, _mm_loadu_si128((__m128i *)(data + i + 0x20))); + x3 = do_one_fold(x3, _mm_loadu_si128((__m128i *)(data + i + 0x30))); + } + return amount; + } + uint32_t update_final(const unsigned char *__restrict data, size_t len) { + if (len >= 64) { + update(data, len); + data += len & ~63; + len &= 63; + } + + if (len >= 48) { + 
auto t3 = x3; + x3 = do_one_fold(x2, _mm_loadu_si128((__m128i *)data + 2)); + x2 = do_one_fold(x1, _mm_loadu_si128((__m128i *)data + 1)); + x1 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x0 = t3; + } else if (len >= 32) { + auto t2 = x2; + auto t3 = x3; + x3 = do_one_fold(x1, _mm_loadu_si128((__m128i *)data + 1)); + x2 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x1 = t3; + x0 = t2; + } else if (len >= 16) { + auto t3 = x3; + x3 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x0 = x1; + x1 = x2; + x2 = t3; + } + data += len & 48; + len &= 15; + + if (len > 0) { + auto xmm_shl = _mm_loadu_si128((__m128i *)(pshufb_shf_table + len)); + auto xmm_shr = _mm_xor_si128(xmm_shl, _mm_set1_epi8(-128)); + + auto t0 = _mm_loadu_si128((__m128i *)data); + auto t1 = _mm_shuffle_epi8(x0, xmm_shl); + + x0 = _mm_or_si128(_mm_shuffle_epi8(x0, xmm_shr), + _mm_shuffle_epi8(x1, xmm_shl)); + x1 = _mm_or_si128(_mm_shuffle_epi8(x1, xmm_shr), + _mm_shuffle_epi8(x2, xmm_shl)); + x2 = _mm_or_si128(_mm_shuffle_epi8(x2, xmm_shr), + _mm_shuffle_epi8(x3, xmm_shl)); + x3 = _mm_or_si128(_mm_shuffle_epi8(x3, xmm_shr), + _mm_shuffle_epi8(t0, xmm_shl)); + + x3 = do_one_fold(t1, x3); + } + + const auto k3k4 = _mm_set_epi32(1, 0x751997d0, 0, 0xccaa009e); + const auto k5k4 = _mm_set_epi32(1, 0x63cd6124, 0, 0xccaa009e); + const auto poly = _mm_set_epi32(1, 0xdb710640, 0, 0xf7011641); + + x0 = double_xor(x1, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + x0 = double_xor(x2, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + x0 = double_xor(x3, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + + x1 = + _mm_xor_si128(_mm_clmulepi64_si128(x0, k5k4, 0), _mm_srli_si128(x0, 8)); + + x0 = _mm_slli_si128(x1, 4); + x0 = _mm_clmulepi64_si128(x0, k5k4, 0x10); +#ifdef __AVX512VL__ + x0 = _mm_ternarylogic_epi32(x0, x1, _mm_set_epi32(0, -1, -1, 0), 0x28); +#else + x1 = _mm_and_si128(x1, 
_mm_set_epi32(0, -1, -1, 0)); + x0 = _mm_xor_si128(x0, x1); +#endif + + x1 = _mm_clmulepi64_si128(x0, poly, 0); + x1 = _mm_clmulepi64_si128(x1, poly, 0x10); +#ifdef __AVX512VL__ + x1 = _mm_ternarylogic_epi32(x1, x0, x0, 0xC3); // NOT(XOR(x1, x0)) +#else + x0 = _mm_xor_si128(x0, _mm_set_epi32(0, -1, -1, 0)); + x1 = _mm_xor_si128(x1, x0); +#endif + return _mm_extract_epi32(x1, 2); + } +}; +#else +} // namespace +#include +#include +#include +namespace { +// from https://joelfilho.com/blog/2020/compile_time_lookup_tables_in_cpp/ +template +constexpr auto lut_impl(Generator &&f, std::index_sequence) { + using content_type = decltype(f(std::size_t{0})); + return std::array{{f(Indexes)...}}; +} +template +constexpr auto lut(Generator &&f) { + return lut_impl(std::forward(f), + std::make_index_sequence{}); +} + +constexpr uint32_t crc32_slice8_gen(unsigned n) { + uint32_t crc = n & 0xff; + for (int i = n >> 8; i >= 0; i--) { + for (int j = 0; j < 8; j++) { + crc = (crc >> 1) ^ + ((crc & 1) * 0xEDB88320); // 0xEDB88320 = CRC32 polynomial + } + } + return crc; +} +static constexpr auto kCrcSlice8LUT = lut<256 * 8>(crc32_slice8_gen); + +class Crc32 { + uint32_t state; + + // this is based off Fast CRC32 slice-by-8: + // https://create.stephan-brumme.com/crc32/ + static inline uint32_t crc_process_iter(uint32_t crc, + const uint32_t *current) { + uint32_t one = *current++ ^ crc; + uint32_t two = *current; + return kCrcSlice8LUT[(two >> 24) & 0xFF] ^ + kCrcSlice8LUT[0x100 + ((two >> 16) & 0xFF)] ^ + kCrcSlice8LUT[0x200 + ((two >> 8) & 0xFF)] ^ + kCrcSlice8LUT[0x300 + (two & 0xFF)] ^ + kCrcSlice8LUT[0x400 + ((one >> 24) & 0xFF)] ^ + kCrcSlice8LUT[0x500 + ((one >> 16) & 0xFF)] ^ + kCrcSlice8LUT[0x600 + ((one >> 8) & 0xFF)] ^ + kCrcSlice8LUT[0x700 + (one & 0xFF)]; + } + +public: + Crc32() : state(0xffffffff) {} + size_t update(const unsigned char *__restrict data, size_t len) { + auto amount = len & ~7; + for (size_t i = 0; i < amount; i += 8) { + state = crc_process_iter(state, 
(uint32_t *)(data + i)); + } + return amount; + } + uint32_t update_final(const unsigned char *__restrict data, size_t len) { + auto i = update(data, len); + for (; i < len; i++) { + state = (state >> 8) ^ kCrcSlice8LUT[(state & 0xFF) ^ data[i]]; + } + return ~state; + } +}; + +#endif + +constexpr unsigned kAdler32Mod = 65521; + +static void UpdateAdler32(uint32_t &s1, uint32_t &s2, uint8_t byte) { + s1 += byte; + s2 += s1; + s1 %= kAdler32Mod; + s2 %= kAdler32Mod; +} + +static uint32_t hadd(MIVEC v) { + auto sum = +#ifdef __AVX2__ + _mm_add_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); +#else + v; +#endif + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, sum); + return _mm_cvtsi128_si32(sum); +} + +template +static FORCE_INLINE MIVEC PredictVec(const unsigned char *current_buf, + const unsigned char *top_buf, + const unsigned char *left_buf, + const unsigned char *topleft_buf) { + auto data = MMSI(load)((MIVEC *)(current_buf)); + if (predictor == 0) { + return data; + } else if (predictor == 1) { + auto pred = MMSI(loadu)((MIVEC *)(left_buf)); + return MM(sub_epi8)(data, pred); + } else if (predictor == 2) { + auto pred = MMSI(load)((MIVEC *)(top_buf)); + return MM(sub_epi8)(data, pred); + } else if (predictor == 3) { + auto left = MMSI(loadu)((MIVEC *)(left_buf)); + auto top = MMSI(load)((MIVEC *)(top_buf)); + auto pred = MM(sub_epi8)(MM(add_epi8)(top, left), MM(avg_epu8)(top, left)); + return MM(sub_epi8)(data, pred); + } else { + auto left = MMSI(loadu)((MIVEC *)(left_buf)); + auto top = MMSI(load)((MIVEC *)(top_buf)); + auto c = MMSI(loadu)((MIVEC *)(topleft_buf)); + + auto a = MM(min_epu8)(left, top); + auto b = MM(max_epu8)(left, top); + + auto pa = MM(subs_epu8)(b, c); + auto pb = MM(subs_epu8)(c, a); + + auto min_pab = MM(min_epu8)(pa, pb); + auto pc = MM(sub_epi8)(MM(max_epu8)(pa, pb), min_pab); + + auto min_pabc = MM(min_epu8)(min_pab, pc); + auto use_a = MM(cmpeq_epi8)(min_pabc, pa); + auto use_b = 
MM(cmpeq_epi8)(min_pabc, pb); + + auto pred = MM(blendv_epi8)(MM(blendv_epi8)(c, b, use_b), a, use_a); + return MM(sub_epi8)(data, pred); + } +} + +alignas(SIMD_WIDTH) constexpr int32_t _kMaskVec[] = {0, 0, 0, 0, +#if SIMD_WIDTH == 32 + 0, 0, 0, 0, + -1, -1, -1, -1, +#endif + -1, -1, -1, -1}; +static const uint8_t *kMaskVec = + reinterpret_cast(_kMaskVec) + SIMD_WIDTH; + +template +static void +ProcessRow(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, CB &&cb, CB_ADL &&cb_adl, + CB_RLE &&cb_rle) { + size_t run = 0; + size_t i = 0; + for (; i + SIMD_WIDTH <= bytes_per_line; i += SIMD_WIDTH) { + auto pdata = PredictVec(current_row_buf + i, top_buf + i, + left_buf + i, topleft_buf + i); + unsigned pdatais0 = + MM(movemask_epi8)(MM(cmpeq_epi8)(pdata, MMSI(setzero)())); + if (pdatais0 == SIMD_MASK) { + run += SIMD_WIDTH; + } else { + if (run != 0) { + cb_rle(run); + } + run = 0; + cb(pdata, SIMD_WIDTH); + } + cb_adl(pdata, SIMD_WIDTH, i); + } + size_t bytes_remaining = + bytes_per_line ^ i; // equivalent to `bytes_per_line - i` + if (bytes_remaining) { + auto pdata = PredictVec(current_row_buf + i, top_buf + i, + left_buf + i, topleft_buf + i); + unsigned pdatais0 = + MM(movemask_epi8)(MM(cmpeq_epi8)(pdata, MMSI(setzero)())); + auto mask = (1UL << bytes_remaining) - 1; + + if ((pdatais0 & mask) == mask && run + bytes_remaining >= 16) { + run += bytes_remaining; + } else { + if (run != 0) { + cb_rle(run); + } + run = 0; + cb(pdata, bytes_remaining); + } + cb_adl(pdata, bytes_remaining, i); + } + if (run != 0) { + cb_rle(run); + } +} + +template +static void +ProcessRow(uint8_t predictor, size_t bytes_per_line, + const unsigned char *current_row_buf, const unsigned char *top_buf, + const unsigned char *left_buf, const unsigned char *topleft_buf, + CB &&cb, CB_ADL &&cb_adl, CB_RLE &&cb_rle) { + if (predictor == 1) { + ProcessRow<1>(bytes_per_line, 
current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 2) { + ProcessRow<2>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 3) { + ProcessRow<3>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 4) { + ProcessRow<4>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else { + assert(predictor == 0); + ProcessRow<0>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } +} + +template static void ForAllRLESymbols(size_t length, CB &&cb) { + assert(length >= 4); + length -= 1; + + if (length < 258) { + // fast path if long sequences are rare in the image + cb(length, 1); + } else { + auto runs = length / 258; + auto remain = length % 258; + if (remain == 1 || remain == 2) { + remain += 258 - 3; + runs--; + cb(3, 1); + } + if (runs) { + cb(258, runs); + } + if (remain) { + cb(remain, 1); + } + } +} + +template +static void +TryPredictor(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *predicted_data, + const HuffmanTable &table, size_t &best_cost, uint8_t &predictor) { + size_t cost_rle = 0; + MIVEC cost_direct = MMSI(setzero)(); + auto cost_chunk_cb = [&](const MIVEC bytes, + const size_t bytes_in_vec) FORCE_INLINE_LAMBDA { + auto data_for_lut = MMSI(and)(MM(set1_epi8)(0xF), bytes); + // get a mask of `bytes` that are between -16 and 15 inclusive + // (`-16 <= bytes <= 15` is equivalent to `bytes + 112 > 95`) + auto use_lowhi = MM(cmpgt_epi8)(MM(add_epi8)(bytes, MM(set1_epi8)(112)), + MM(set1_epi8)(95)); + + auto nbits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_nbits)), data_for_lut); + auto nbits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i 
*)table.last16_nbits)), data_for_lut); + + auto nbits = MM(blendv_epi8)(nbits_low16, nbits_hi16, bytes); + nbits = MM(blendv_epi8)(MM(set1_epi8)(table.mid_nbits), nbits, use_lowhi); + + auto nbits_discard = + MMSI(and)(nbits, MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec))); + + cost_direct = + MM(add_epi32)(cost_direct, MM(sad_epu8)(nbits, nbits_discard)); + }; + auto rle_cost_cb = [&](size_t run) { + cost_rle += table.first16_nbits[0]; + ForAllRLESymbols(run, [&](size_t len, size_t count) { + cost_rle += (table.dist_nbits + table.lz77_length_nbits[len]) * count; + }); + }; + if (store_pred) { + ProcessRow( + bytes_per_line, current_row_buf, top_buf, left_buf, topleft_buf, + cost_chunk_cb, + [=](const MIVEC pdata, size_t, size_t i) { + MMSI(store)((MIVEC *)(predicted_data + i), pdata); + }, + rle_cost_cb); + } else { + ProcessRow( + bytes_per_line, current_row_buf, top_buf, left_buf, topleft_buf, + cost_chunk_cb, [](const MIVEC, size_t, size_t) {}, rle_cost_cb); + } + size_t cost = cost_rle + hadd(cost_direct); + if (cost < best_cost) { + best_cost = cost; + predictor = pred; + } +} + +static FORCE_INLINE void WriteBitsLong(MIVEC nbits, MIVEC bits_lo, + MIVEC bits_hi, size_t mid_lo_nbits, + BitWriter *__restrict writer) { + + // Merge bits_lo and bits_hi in 16-bit "bits". 
+#if FPNGE_USE_PEXT + auto bits0 = MM(unpacklo_epi8)(bits_lo, bits_hi); + auto bits1 = MM(unpackhi_epi8)(bits_lo, bits_hi); + + // convert nbits into a mask + auto nbits_hi = MM(sub_epi8)(nbits, MM(set1_epi8)(mid_lo_nbits)); + auto nbits0 = MM(unpacklo_epi8)(nbits, nbits_hi); + auto nbits1 = MM(unpackhi_epi8)(nbits, nbits_hi); + const auto nbits_to_mask = + BCAST128(_mm_set_epi32(0xffffffff, 0xffffffff, 0x7f3f1f0f, 0x07030100)); + auto bitmask0 = MM(shuffle_epi8)(nbits_to_mask, nbits0); + auto bitmask1 = MM(shuffle_epi8)(nbits_to_mask, nbits1); + + // aggregate nbits + alignas(16) uint16_t nbits_a[SIMD_WIDTH / 4]; + auto bit_count = MM(maddubs_epi16)(nbits, MM(set1_epi8)(1)); +#ifdef __AVX2__ + auto bit_count2 = _mm_hadd_epi16(_mm256_castsi256_si128(bit_count), + _mm256_extracti128_si256(bit_count, 1)); + _mm_store_si128((__m128i *)nbits_a, bit_count2); +#else + bit_count = _mm_hadd_epi16(bit_count, bit_count); + _mm_storel_epi64((__m128i *)nbits_a, bit_count); +#endif + + alignas(SIMD_WIDTH) uint64_t bitmask_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)bitmask_a, bitmask0); + MMSI(store)((MIVEC *)bitmask_a + 1, bitmask1); + +#else + + auto nbits0 = MM(unpacklo_epi8)(nbits, MMSI(setzero)()); + auto nbits1 = MM(unpackhi_epi8)(nbits, MMSI(setzero)()); + MIVEC bits0, bits1; + if (mid_lo_nbits == 8) { + bits0 = MM(unpacklo_epi8)(bits_lo, bits_hi); + bits1 = MM(unpackhi_epi8)(bits_lo, bits_hi); + } else { + auto nbits_shift = _mm_cvtsi32_si128(8 - mid_lo_nbits); + auto bits_lo_shifted = MM(sll_epi16)(bits_lo, nbits_shift); + bits0 = MM(unpacklo_epi8)(bits_lo_shifted, bits_hi); + bits1 = MM(unpackhi_epi8)(bits_lo_shifted, bits_hi); + + bits0 = MM(srl_epi16)(bits0, nbits_shift); + bits1 = MM(srl_epi16)(bits1, nbits_shift); + } + + // 16 -> 32 + auto nbits0_32_lo = MMSI(and)(nbits0, MM(set1_epi32)(0xFFFF)); + auto nbits1_32_lo = MMSI(and)(nbits1, MM(set1_epi32)(0xFFFF)); + + auto bits0_32_lo = MMSI(and)(bits0, MM(set1_epi32)(0xFFFF)); + auto bits1_32_lo = MMSI(and)(bits1, 
MM(set1_epi32)(0xFFFF)); +#ifdef __AVX2__ + auto bits0_32_hi = MM(sllv_epi32)(MM(srli_epi32)(bits0, 16), nbits0_32_lo); + auto bits1_32_hi = MM(sllv_epi32)(MM(srli_epi32)(bits1, 16), nbits1_32_lo); +#else + // emulate variable shift by abusing float exponents + // this works because Huffman symbols are not allowed to exceed 15 bits, so + // will fit within a float's mantissa and (number << 15) won't overflow when + // converted back to a signed int + auto bits0_32_hi = + _mm_castps_si128(MM(cvtepi32_ps)(MM(srli_epi32)(bits0, 16))); + auto bits1_32_hi = + _mm_castps_si128(MM(cvtepi32_ps)(MM(srli_epi32)(bits1, 16))); + + // add shift amount to the exponent + bits0_32_hi = MM(add_epi32)(bits0_32_hi, MM(slli_epi32)(nbits0_32_lo, 23)); + bits1_32_hi = MM(add_epi32)(bits1_32_hi, MM(slli_epi32)(nbits1_32_lo, 23)); + + bits0_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits0_32_hi)); + bits1_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits1_32_hi)); +#endif + + nbits0 = MM(madd_epi16)(nbits0, MM(set1_epi16)(1)); + nbits1 = MM(madd_epi16)(nbits1, MM(set1_epi16)(1)); + auto bits0_32 = MMSI(or)(bits0_32_lo, bits0_32_hi); + auto bits1_32 = MMSI(or)(bits1_32_lo, bits1_32_hi); + + // 32 -> 64 +#ifdef __AVX2__ + auto nbits_inv0_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits0); + auto nbits_inv1_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits1); + bits0 = MM(sllv_epi32)(bits0_32, nbits_inv0_64_lo); + bits1 = MM(sllv_epi32)(bits1_32, nbits_inv1_64_lo); + bits0 = MM(srlv_epi64)(bits0, nbits_inv0_64_lo); + bits1 = MM(srlv_epi64)(bits1, nbits_inv1_64_lo); +#else + auto nbits0_64_lo = MMSI(and)(nbits0, MM(set1_epi64x)(0xFFFFFFFF)); + auto nbits1_64_lo = MMSI(and)(nbits1, MM(set1_epi64x)(0xFFFFFFFF)); + // just do two shifts for SSE variant + auto bits0_64_lo = MMSI(and)(bits0_32, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits1_64_lo = MMSI(and)(bits1_32, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits0_64_hi = MM(srli_epi64)(bits0_32, 32); + auto bits1_64_hi = MM(srli_epi64)(bits1_32, 32); + + 
bits0_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits0_64_hi, nbits0_64_lo), + _mm_sll_epi64(bits0_64_hi, + _mm_unpackhi_epi64(nbits0_64_lo, nbits0_64_lo)), + 0xf0); + bits1_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits1_64_hi, nbits1_64_lo), + _mm_sll_epi64(bits1_64_hi, + _mm_unpackhi_epi64(nbits1_64_lo, nbits1_64_lo)), + 0xf0); + + bits0 = MMSI(or)(bits0_64_lo, bits0_64_hi); + bits1 = MMSI(or)(bits1_64_lo, bits1_64_hi); +#endif + + auto nbits01 = MM(hadd_epi32)(nbits0, nbits1); + + // nbits_a <= 40 as we have at most 10 bits per symbol, so the call to the + // writer is safe. + alignas(SIMD_WIDTH) uint32_t nbits_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)nbits_a, nbits01); + +#endif + + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)bits_a, bits0); + MMSI(store)((MIVEC *)bits_a + 1, bits1); + +#ifdef __AVX2__ + constexpr uint8_t kPerm[] = {0, 1, 4, 5, 2, 3, 6, 7}; +#else + constexpr uint8_t kPerm[] = {0, 1, 2, 3}; +#endif + + for (size_t ii = 0; ii < SIMD_WIDTH / 4; ii++) { + uint64_t bits = bits_a[kPerm[ii]]; +#if FPNGE_USE_PEXT + bits = _pext_u64(bits, bitmask_a[kPerm[ii]]); +#endif + auto count = nbits_a[ii]; + writer->Write(count, bits); + } +} + +// as above, but where nbits <= 8, so we can ignore bits_hi +static FORCE_INLINE void WriteBitsShort(MIVEC nbits, MIVEC bits, + BitWriter *__restrict writer) { + +#if FPNGE_USE_PEXT + // convert nbits into a mask + auto bitmask = MM(shuffle_epi8)( + BCAST128(_mm_set_epi32(0xffffffff, 0xffffffff, 0x7f3f1f0f, 0x07030100)), + nbits); + auto bit_count = MM(sad_epu8)(nbits, MMSI(setzero)()); + + alignas(SIMD_WIDTH) uint64_t nbits_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)nbits_a, bit_count); + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)bits_a, bits); + alignas(SIMD_WIDTH) uint64_t bitmask_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)bitmask_a, bitmask); +#else + // 8 -> 16 + auto prod = MM(slli_epi16)( + MM(shuffle_epi8)(BCAST128(_mm_set_epi32( + // since we can't 
handle 8 bits, we'll under-shift + // it and do an extra shift later on + -1, 0xffffff80, 0x40201008, 0x040201ff)), + nbits), + 8); + auto bits_hi = + MM(mulhi_epu16)(MMSI(andnot)(MM(set1_epi16)(0xff), bits), prod); + bits_hi = MM(add_epi16)(bits_hi, bits_hi); // fix under-shifting + bits = MMSI(or)(MMSI(and)(bits, MM(set1_epi16)(0xff)), bits_hi); + nbits = MM(maddubs_epi16)(nbits, MM(set1_epi8)(1)); + + // 16 -> 32 + auto nbits_32_lo = MMSI(and)(nbits, MM(set1_epi32)(0xFFFF)); + auto bits_32_lo = MMSI(and)(bits, MM(set1_epi32)(0xFFFF)); + auto bits_32_hi = MM(srli_epi32)(bits, 16); +#ifdef __AVX2__ + bits_32_hi = MM(sllv_epi32)(bits_32_hi, nbits_32_lo); +#else + // need to avoid overflow when converting float -> int, because it converts to + // a signed int; do this by offsetting the shift by 1 + nbits_32_lo = MM(add_epi16)(nbits_32_lo, MM(set1_epi32)(0xFFFF)); + bits_32_hi = _mm_castps_si128(MM(cvtepi32_ps)(bits_32_hi)); + bits_32_hi = MM(add_epi32)(bits_32_hi, MM(slli_epi32)(nbits_32_lo, 23)); + bits_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits_32_hi)); + bits_32_hi = MM(add_epi32)(bits_32_hi, bits_32_hi); // fix under-shifting +#endif + nbits = MM(madd_epi16)(nbits, MM(set1_epi16)(1)); + bits = MMSI(or)(bits_32_lo, bits_32_hi); + + // 32 -> 64 +#ifdef __AVX2__ + auto nbits_inv_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits); + bits = MM(sllv_epi32)(bits, nbits_inv_64_lo); + bits = MM(srlv_epi64)(bits, nbits_inv_64_lo); +#else + auto nbits_64_lo = MMSI(and)(nbits, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits_64_lo = MMSI(and)(bits, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits_64_hi = MM(srli_epi64)(bits, 32); + bits_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits_64_hi, nbits_64_lo), + _mm_sll_epi64(bits_64_hi, _mm_unpackhi_epi64(nbits_64_lo, nbits_64_lo)), + 0xf0); + bits = MMSI(or)(bits_64_lo, bits_64_hi); +#endif + + auto nbits2 = _mm_hadd_epi32( +#ifdef __AVX2__ + _mm256_castsi256_si128(nbits), _mm256_extracti128_si256(nbits, 1) +#else + nbits, nbits +#endif + ); 
+ + alignas(16) uint32_t nbits_a[4]; + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 8]; + + _mm_store_si128((__m128i *)nbits_a, nbits2); + MMSI(store)((MIVEC *)bits_a, bits); + +#endif + + for (size_t ii = 0; ii < SIMD_WIDTH / 8; ii++) { + uint64_t bits64 = bits_a[ii]; +#if FPNGE_USE_PEXT + bits64 = _pext_u64(bits64, bitmask_a[ii]); +#endif + if (nbits_a[ii] + writer->bits_in_buffer > 63) { + // hope this case rarely occurs + writer->Write(16, bits64 & 0xffff); + bits64 >>= 16; + nbits_a[ii] -= 16; + } + writer->Write(nbits_a[ii], bits64); + } +} + +static FORCE_INLINE void AddApproxCost(MIVEC &total, MIVEC pdata, + MIVEC bit_costs) { + auto approx_sym = MM(min_epu8)(MM(abs_epi8)(pdata), MM(set1_epi8)(15)); + auto cost = MM(shuffle_epi8)(bit_costs, approx_sym); + total = MM(add_epi64)(total, MM(sad_epu8)(cost, MMSI(setzero)())); +} +static FORCE_INLINE void AddApproxCost(MIVEC &total, MIVEC pdata, + MIVEC bit_costs, MIVEC maskv) { + auto approx_sym = MM(min_epu8)(MM(abs_epi8)(pdata), MM(set1_epi8)(15)); + auto cost = MM(shuffle_epi8)(bit_costs, approx_sym); + auto cost_mask = MMSI(and)(maskv, cost); + total = MM(add_epi64)(total, MM(sad_epu8)(cost, cost_mask)); +} + +static uint8_t +SelectPredictor(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + const HuffmanTable &table, const struct FPNGEOptions *options) { + if (options->predictor <= 4) { + return options->predictor; + } + if (options->predictor == FPNGE_PREDICTOR_APPROX) { + auto bit_costs = BCAST128(_mm_load_si128((__m128i *)(table.approx_nbits))); + size_t i = 0; + auto cost1 = MMSI(setzero)(); + auto cost2 = MMSI(setzero)(); + auto cost3 = MMSI(setzero)(); + auto cost4 = MMSI(setzero)(); + MIVEC pdata; + + for (; i + SIMD_WIDTH <= bytes_per_line; i += SIMD_WIDTH) { + pdata = PredictVec<1>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + 
AddApproxCost(cost1, pdata, bit_costs); + + pdata = PredictVec<2>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost2, pdata, bit_costs); + + pdata = PredictVec<3>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost3, pdata, bit_costs); + + pdata = PredictVec<4>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost4, pdata, bit_costs); + MMSI(store)((MIVEC *)(paeth_data + i), pdata); + } + + size_t bytes_remaining = + bytes_per_line ^ i; // equivalent to `bytes_per_line - i` + if (bytes_remaining) { + auto maskv = MMSI(loadu)((MIVEC *)(kMaskVec - bytes_remaining)); + + pdata = PredictVec<1>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost1, pdata, bit_costs, maskv); + + pdata = PredictVec<2>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost2, pdata, bit_costs, maskv); + + pdata = PredictVec<3>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost3, pdata, bit_costs, maskv); + + pdata = PredictVec<4>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost4, pdata, bit_costs, maskv); + MMSI(store)((MIVEC *)(paeth_data + i), pdata); + } + + uint8_t predictor = 1; + size_t best_cost = hadd(cost1); + auto test_cost = [&](MIVEC costv, uint8_t pred) { + size_t cost = hadd(costv); + if (cost < best_cost) { + best_cost = cost; + predictor = pred; + } + }; + test_cost(cost2, 2); + test_cost(cost3, 3); + test_cost(cost4, 4); + return predictor; + } + + assert(options->predictor == FPNGE_PREDICTOR_BEST); + uint8_t predictor; + size_t best_cost = ~0U; + TryPredictor<1, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, best_cost, predictor); + TryPredictor<2, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, 
best_cost, predictor); + TryPredictor<3, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, best_cost, predictor); + TryPredictor<4, /*store_pred=*/true>(bytes_per_line, current_row_buf, top_buf, + left_buf, topleft_buf, paeth_data, table, + best_cost, predictor); + return predictor; +} + +static void +EncodeOneRow(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + const HuffmanTable &table, uint32_t &s1, uint32_t &s2, + BitWriter *__restrict writer, const struct FPNGEOptions *options) { + uint8_t predictor = + SelectPredictor(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, paeth_data, table, options); + + writer->Write(table.first16_nbits[predictor], table.first16_bits[predictor]); + UpdateAdler32(s1, s2, predictor); + + auto adler_accum_s1 = INT2VEC(s1); + auto adler_accum_s2 = INT2VEC(s2); + auto adler_s1_sum = MMSI(setzero)(); + + uint16_t bytes_since_flush = 1; + + auto flush_adler = [&]() { + adler_accum_s2 = MM(add_epi32)( + adler_accum_s2, MM(slli_epi32)(adler_s1_sum, SIMD_WIDTH == 32 ? 
5 : 4)); + adler_s1_sum = MMSI(setzero)(); + + uint32_t ls1 = hadd(adler_accum_s1); + uint32_t ls2 = hadd(adler_accum_s2); + ls1 %= kAdler32Mod; + ls2 %= kAdler32Mod; + s1 = ls1; + s2 = ls2; + adler_accum_s1 = INT2VEC(s1); + adler_accum_s2 = INT2VEC(s2); + bytes_since_flush = 0; + }; + + auto encode_chunk_cb = [&](const MIVEC bytes, const size_t bytes_in_vec) { + auto maskv = MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec)); + + auto data_for_lut = MMSI(and)(MM(set1_epi8)(0xF), bytes); + data_for_lut = MMSI(or)(data_for_lut, maskv); + // get a mask of `bytes` that are between -16 and 15 inclusive + // (`-16 <= bytes <= 15` is equivalent to `bytes + 112 > 95`) + auto use_lowhi = MM(cmpgt_epi8)(MM(add_epi8)(bytes, MM(set1_epi8)(112)), + MM(set1_epi8)(95)); + + auto nbits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_nbits)), data_for_lut); + auto nbits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.last16_nbits)), data_for_lut); + auto nbits = MM(blendv_epi8)(nbits_low16, nbits_hi16, bytes); + + auto bits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_bits)), data_for_lut); + auto bits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.last16_bits)), data_for_lut); + auto bits_lo = MM(blendv_epi8)(bits_low16, bits_hi16, bytes); + + if (MM(movemask_epi8)(use_lowhi) ^ SIMD_MASK) { + auto data_for_midlut = + MMSI(and)(MM(set1_epi8)(0xF), MM(srai_epi16)(bytes, 4)); + + auto bits_mid_lo = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.mid_lowbits)), + data_for_midlut); + + auto bits_hi = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)kBitReverseNibbleLookup)), + data_for_lut); + + use_lowhi = MMSI(or)(use_lowhi, maskv); + nbits = MM(blendv_epi8)(MM(set1_epi8)(table.mid_nbits), nbits, use_lowhi); + bits_lo = MM(blendv_epi8)(bits_mid_lo, bits_lo, use_lowhi); + +#if !FPNGE_USE_PEXT + bits_hi = MMSI(andnot)(use_lowhi, bits_hi); +#endif + + 
WriteBitsLong(nbits, bits_lo, bits_hi, table.mid_nbits - 4, writer); + } else { + // since mid (symbols 16-239) is not present, we can take some shortcuts + // this is expected to occur frequently if compression is effective + WriteBitsShort(nbits, bits_lo, writer); + } + }; + + auto adler_chunk_cb = [&](const MIVEC pdata, size_t bytes_in_vec, size_t) { + bytes_since_flush += bytes_in_vec; + auto bytes = pdata; + + auto muls = MM(set_epi8)( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#if SIMD_WIDTH == 32 + , + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +#endif + ); + + if (bytes_in_vec < SIMD_WIDTH) { + adler_accum_s2 = MM(add_epi32)( + MM(mul_epu32)(MM(set1_epi32)(bytes_in_vec), adler_accum_s1), + adler_accum_s2); + bytes = + MMSI(andnot)(MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec)), bytes); + muls = MM(add_epi8)(muls, MM(set1_epi8)(bytes_in_vec - SIMD_WIDTH)); + } else { + adler_s1_sum = MM(add_epi32)(adler_s1_sum, adler_accum_s1); + } + + adler_accum_s1 = + MM(add_epi32)(adler_accum_s1, MM(sad_epu8)(bytes, MMSI(setzero)())); + + auto bytesmuls = MM(maddubs_epi16)(bytes, muls); + adler_accum_s2 = MM(add_epi32)( + adler_accum_s2, MM(madd_epi16)(bytesmuls, MM(set1_epi16)(1))); + + if (bytes_since_flush >= 5500) { + flush_adler(); + } + }; + + auto encode_rle_cb = [&](size_t run) { + writer->Write(table.first16_nbits[0], table.first16_bits[0]); + ForAllRLESymbols(run, [&](size_t len, size_t count) { + uint32_t bits = (table.dist_bits << table.lz77_length_nbits[len]) | + table.lz77_length_bits[len]; + auto nbits = table.lz77_length_nbits[len] + table.dist_nbits; + while (count--) { + writer->Write(nbits, bits); + } + }); + }; + + if (options->predictor > 4 && predictor == 4) { + // re-use Paeth data + ProcessRow<0>(bytes_per_line, paeth_data, nullptr, nullptr, nullptr, + encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } else { + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, 
encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } + + flush_adler(); +} + +static void +CollectSymbolCounts(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + uint64_t *__restrict symbol_counts, + const struct FPNGEOptions *options) { + + auto encode_chunk_cb = [&](const MIVEC pdata, const size_t bytes_in_vec) { + alignas(SIMD_WIDTH) uint8_t predicted_data[SIMD_WIDTH]; + MMSI(store)((MIVEC *)predicted_data, pdata); + for (size_t i = 0; i < bytes_in_vec; i++) { + symbol_counts[predicted_data[i]] += 1; + } + }; + + auto adler_chunk_cb = [&](const MIVEC, size_t, size_t) {}; + + auto encode_rle_cb = [&](size_t run) { + symbol_counts[0] += 1; + constexpr size_t kLZ77Sym[] = { + 0, 0, 0, 257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, + 266, 267, 267, 268, 268, 269, 269, 269, 269, 270, 270, 270, 270, 271, + 271, 271, 271, 272, 272, 272, 272, 273, 273, 273, 273, 273, 273, 273, + 273, 274, 274, 274, 274, 274, 274, 274, 274, 275, 275, 275, 275, 275, + 275, 275, 275, 276, 276, 276, 276, 276, 276, 276, 276, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 278, + 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, + 278, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, + 279, 279, 279, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 284, 284, 284, 
284, 284, 284, 284, 284, 284, 284, 284, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, + 284, 284, 284, 284, 284, 284, 285, + }; + ForAllRLESymbols(run, [&](size_t len, size_t count) { + symbol_counts[kLZ77Sym[len]] += count; + }); + }; + + if (options->predictor == FPNGE_PREDICTOR_APPROX) { + // filter selection here seems to be slightly more effective when using the + // approximate selector; more investigation is probably warranted + HuffmanTable dummy_table; + uint8_t predictor = + SelectPredictor(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, paeth_data, dummy_table, options); + if (predictor == 4) { + ProcessRow<0>(bytes_per_line, paeth_data, nullptr, nullptr, nullptr, + encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } else { + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } + } else { + uint8_t predictor = options->predictor > 4 ? 4 : options->predictor; + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } +} + +static void AppendBE32(size_t value, BitWriter *__restrict writer) { + writer->Write(8, value >> 24); + writer->Write(8, (value >> 16) & 0xFF); + writer->Write(8, (value >> 8) & 0xFF); + writer->Write(8, value & 0xFF); +} + +static void WriteHeader(size_t width, size_t height, size_t bytes_per_channel, + size_t num_channels, char cicp_colorspace, + const FPNGEAdditionalChunk *additional_chunks, + int num_additional_chunks, + BitWriter *__restrict writer) { + constexpr uint8_t kPNGHeader[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + for (size_t i = 0; i < 8; i++) { + writer->Write(8, kPNGHeader[i]); + } + // Length + writer->Write(32, 0x0d000000); + assert(writer->bits_in_buffer == 0); + size_t crc_start = writer->bytes_written; + // IHDR + writer->Write(32, 0x52444849); + AppendBE32(width, writer); + AppendBE32(height, 
writer); + // Bit depth + writer->Write(8, bytes_per_channel * 8); + // Colour type + constexpr uint8_t numc_to_colour_type[] = {0, 0, 4, 2, 6}; + writer->Write(8, numc_to_colour_type[num_channels]); + // Compression, filter and interlace methods. + writer->Write(24, 0); + assert(writer->bits_in_buffer == 0); + size_t crc_end = writer->bytes_written; + uint32_t crc = + Crc32().update_final(writer->data + crc_start, crc_end - crc_start); + AppendBE32(crc, writer); + + if (cicp_colorspace == FPNGE_CICP_PQ) { + writer->Write(32, 0x04000000); + writer->Write(32, 0x50434963); // cICP + writer->Write(32, 0x01001009); // PQ, Rec2020 + writer->Write(32, 0xfe23234d); // CRC + } + for (int i = 0; i < num_additional_chunks; i++) { + AppendBE32(additional_chunks[i].data_size, writer); + size_t crc_start = writer->bytes_written; + uint32_t name = 0; + memcpy(&name, additional_chunks[i].name, 4); + writer->Write(32, name); + writer->WriteBytes(additional_chunks[i].data, + additional_chunks[i].data_size); + size_t crc_end = writer->bytes_written; + uint32_t crc = + Crc32().update_final(writer->data + crc_start, crc_end - crc_start); + AppendBE32(crc, writer); + } +} + +void CopyRow(unsigned char *dst, const unsigned char *src, size_t nb_channels, + size_t bytes_per_channel, FPNGEColorChannelOrder order, + size_t width) { + if (order == FPNGE_ORDER_RGB || nb_channels <= 2) { + memcpy(dst, src, nb_channels * bytes_per_channel * width); + return; + } + size_t x = 0; + if (nb_channels == 4 && bytes_per_channel == 1) { + for (; x + SIMD_WIDTH / 4 <= width; x += SIMD_WIDTH / 4) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 4)); + auto shuf = MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, + 13, 12, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 4), shuf); + } + } else if (nb_channels == 4 && bytes_per_channel == 2) { + for (; x + SIMD_WIDTH / 8 <= width; x += SIMD_WIDTH / 8) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 8)); + auto shuf = 
MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(4, 5, 2, 3, 0, 1, 6, 7, 12, 13, 10, 11, 8, + 9, 14, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 8), shuf); + } + } else if (nb_channels == 3 && bytes_per_channel == 1) { + for (; x + 16 <= width; x += 16) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 3)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 3 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 3 + 32)); + auto s1a = + _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, -1); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 1); + auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(-1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(0, -1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, -1, 15); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 0, -1); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(-1, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 3), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 32), shuf3); + } + } else if (nb_channels == 3 && bytes_per_channel == 2) { + for (; x + 8 <= width; x += 8) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 6)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 6 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 6 + 32)); + auto s1a = + _mm_setr_epi8(4, 5, 2, 3, 0, 1, 10, 11, 8, 9, 6, 7, -1, -1, 14, 15); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, -1, -1); 
+ auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(-1, -1, 6, 7, 4, 5, 2, 3, 12, 13, 10, 11, 8, 9, -1, -1); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 2, 3); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(-1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(0, 1, -1, -1, 8, 9, 6, 7, 4, 5, 14, 15, 12, 13, 10, 11); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 6), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 32), shuf3); + } + } + for (; x < width; x++) { + if (nb_channels == 3 && bytes_per_channel == 1) { + dst[x * 3] = src[x * 3 + 2]; + dst[x * 3 + 1] = src[x * 3 + 1]; + dst[x * 3 + 2] = src[x * 3]; + } + if (nb_channels == 3 && bytes_per_channel == 2) { + dst[x * 6] = src[x * 6 + 4]; + dst[x * 6 + 1] = src[x * 6 + 3]; + dst[x * 6 + 2] = src[x * 6 + 2]; + dst[x * 6 + 3] = src[x * 6 + 3]; + dst[x * 6 + 4] = src[x * 6]; + dst[x * 6 + 5] = src[x * 6 + 1]; + } + if (nb_channels == 4 && bytes_per_channel == 1) { + dst[x * 4] = src[x * 4 + 2]; + dst[x * 4 + 1] = src[x * 4 + 1]; + dst[x * 4 + 2] = src[x * 4]; + dst[x * 4 + 3] = src[x * 4 + 3]; + } + if (nb_channels == 4 && bytes_per_channel == 2) { + dst[x * 8] = src[x * 8 + 4]; + dst[x * 8 + 1] = src[x * 8 + 3]; + dst[x * 8 + 2] = src[x * 8 + 2]; + dst[x * 8 + 3] = src[x * 8 + 3]; + dst[x * 8 + 4] = src[x * 8]; + dst[x * 8 + 5] = src[x * 8 + 1]; + dst[x * 8 + 6] = src[x * 8 + 6]; + dst[x * 8 + 7] = src[x * 8 + 7]; + } + } +} + +} // namespace + +extern "C" size_t FPNGEEncode(size_t bytes_per_channel, size_t 
num_channels, + const void *data, size_t width, size_t row_stride, + size_t height, void *output, + const struct FPNGEOptions *options) { + assert(bytes_per_channel == 1 || bytes_per_channel == 2); + assert(num_channels != 0 && num_channels <= 4); + size_t bytes_per_line = bytes_per_channel * num_channels * width; + assert(row_stride >= bytes_per_line); + + // allows for padding, and for extra initial space for the "left" pixel for + // predictors. + size_t bytes_per_line_buf = + (bytes_per_line + 4 * bytes_per_channel + SIMD_WIDTH - 1) / SIMD_WIDTH * + SIMD_WIDTH; + + // Extra space for alignment purposes. + std::vector buf(bytes_per_line_buf * 2 + SIMD_WIDTH - 1 + + 4 * bytes_per_channel); + unsigned char *aligned_buf_ptr = buf.data() + 4 * bytes_per_channel; + aligned_buf_ptr += (intptr_t)aligned_buf_ptr % SIMD_WIDTH + ? (SIMD_WIDTH - (intptr_t)aligned_buf_ptr % SIMD_WIDTH) + : 0; + + std::vector pdata_buf(bytes_per_line_buf + SIMD_WIDTH - 1); + unsigned char *aligned_pdata_ptr = pdata_buf.data(); + aligned_pdata_ptr += + (intptr_t)aligned_pdata_ptr % SIMD_WIDTH + ? (SIMD_WIDTH - (intptr_t)aligned_pdata_ptr % SIMD_WIDTH) + : 0; + + struct FPNGEOptions default_options; + if (options == nullptr) { + FPNGEFillOptions(&default_options, FPNGE_COMPRESS_LEVEL_DEFAULT, + FPNGE_CICP_NONE); + options = &default_options; + } + + // options sanity check + assert(options->predictor >= 0 && options->predictor <= FPNGE_PREDICTOR_BEST); + assert(options->huffman_sample >= 0 && options->huffman_sample <= 127); + + BitWriter writer; + writer.data = static_cast(output); + + WriteHeader(width, height, bytes_per_channel, num_channels, + options->cicp_colorspace, options->additional_chunks, + options->num_additional_chunks, &writer); + + assert(writer.bits_in_buffer == 0); + size_t chunk_length_pos = writer.bytes_written; + writer.bytes_written += 4; // Skip space for length. 
+ size_t crc_pos = writer.bytes_written; + writer.Write(32, 0x54414449); // IDAT + // Deflate header + writer.Write(8, 8); // deflate with smallest window + writer.Write(8, 29); // cfm+flg check value + + uint64_t symbol_counts[286] = {}; + + // Sample rows in the center of the image. + size_t y0 = height * (127 - options->huffman_sample) / 256; + size_t y1 = height * (129 + options->huffman_sample) / 256; + if (y1 == 0 && height > 0) { // for 1 pixel high images + y1 = 1; + } + + for (size_t y = y0; y < y1; y++) { + const unsigned char *current_row_in = + static_cast(data) + row_stride * y; + unsigned char *current_row_buf = + aligned_buf_ptr + (y % 2 ? bytes_per_line_buf : 0); + const unsigned char *top_buf = + aligned_buf_ptr + ((y + 1) % 2 ? bytes_per_line_buf : 0); + const unsigned char *left_buf = + current_row_buf - bytes_per_channel * num_channels; + const unsigned char *topleft_buf = + top_buf - bytes_per_channel * num_channels; + + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); + if (y == y0 && y != 0) { + continue; + } + + CollectSymbolCounts(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, aligned_pdata_ptr, symbol_counts, options); + } + + memset(buf.data(), 0, buf.size()); + + HuffmanTable huffman_table(symbol_counts); + + // Single block, dynamic huffman + writer.Write(3, 0b101); + WriteHuffmanCode(huffman_table, &writer); + + Crc32 crc; + uint32_t s1 = 1; + uint32_t s2 = 0; + for (size_t y = 0; y < height; y++) { + const unsigned char *current_row_in = + static_cast(data) + row_stride * y; + unsigned char *current_row_buf = + aligned_buf_ptr + (y % 2 ? bytes_per_line_buf : 0); + const unsigned char *top_buf = + aligned_buf_ptr + ((y + 1) % 2 ? 
bytes_per_line_buf : 0); + const unsigned char *left_buf = + current_row_buf - bytes_per_channel * num_channels; + const unsigned char *topleft_buf = + top_buf - bytes_per_channel * num_channels; + + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); + + EncodeOneRow(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, aligned_pdata_ptr, huffman_table, s1, s2, &writer, + options); + + crc_pos += + crc.update(writer.data + crc_pos, writer.bytes_written - crc_pos); + } + + // EOB + writer.Write(huffman_table.nbits[256], huffman_table.end_bits); + + writer.ZeroPadToByte(); + assert(writer.bits_in_buffer == 0); + s1 %= kAdler32Mod; + s2 %= kAdler32Mod; + uint32_t adler32 = (s2 << 16) | s1; + AppendBE32(adler32, &writer); + + size_t data_len = writer.bytes_written - chunk_length_pos - 8; + writer.data[chunk_length_pos + 0] = data_len >> 24; + writer.data[chunk_length_pos + 1] = (data_len >> 16) & 0xFF; + writer.data[chunk_length_pos + 2] = (data_len >> 8) & 0xFF; + writer.data[chunk_length_pos + 3] = data_len & 0xFF; + + auto final_crc = + crc.update_final(writer.data + crc_pos, writer.bytes_written - crc_pos); + AppendBE32(final_crc, &writer); + + // IEND + writer.Write(32, 0); + writer.Write(32, 0x444e4549); + writer.Write(32, 0x826042ae); + + return writer.bytes_written; +} diff --git a/ouster_osf/src/fpnge.h b/ouster_osf/src/fpnge.h new file mode 100644 index 00000000..77db8a76 --- /dev/null +++ b/ouster_osf/src/fpnge.h @@ -0,0 +1,102 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef FPNGE_H +#define FPNGE_H +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum FPNGECicpColorspace { FPNGE_CICP_NONE, FPNGE_CICP_PQ }; + +enum FPNGEOptionsPredictor { + FPNGE_PREDICTOR_FIXED_NOOP, + FPNGE_PREDICTOR_FIXED_SUB, + FPNGE_PREDICTOR_FIXED_TOP, + FPNGE_PREDICTOR_FIXED_AVG, + FPNGE_PREDICTOR_FIXED_PAETH, + FPNGE_PREDICTOR_APPROX, + FPNGE_PREDICTOR_BEST +}; + +enum FPNGEColorChannelOrder { + FPNGE_ORDER_RGB, + FPNGE_ORDER_BGR, +}; + +struct FPNGEAdditionalChunk { + char name[4]; + const char *data; + int data_size; +}; + +struct FPNGEOptions { + char predictor; // FPNGEOptionsPredictor + char huffman_sample; // 0-127: how much of the image to sample + char cicp_colorspace; // FPNGECicpColorspace + char channel_order; // FPNGEColorChannelOrder + int num_additional_chunks; + const struct FPNGEAdditionalChunk *additional_chunks; +}; + +#define FPNGE_COMPRESS_LEVEL_DEFAULT 4 +#define FPNGE_COMPRESS_LEVEL_BEST 5 +inline void FPNGEFillOptions(struct FPNGEOptions *options, int level, + int cicp_colorspace) { + if (level == 0) + level = FPNGE_COMPRESS_LEVEL_DEFAULT; + options->cicp_colorspace = cicp_colorspace; + options->huffman_sample = 1; + options->num_additional_chunks = 0; + options->additional_chunks = NULL; + options->channel_order = FPNGE_ORDER_RGB; + switch (level) { + case 1: + options->predictor = 2; + break; + case 2: + options->predictor = 4; + break; + case 3: + options->predictor = 5; + break; + case 5: + options->huffman_sample = 23; + // fall through + default: + options->predictor = 6; + break; + } +} + +// 
bytes_per_channel = 1/2 for 8-bit and 16-bit. num_channels: 1/2/3/4 +// (G/GA/RGB/RGBA) +size_t FPNGEEncode(size_t bytes_per_channel, size_t num_channels, + const void *data, size_t width, size_t row_stride, + size_t height, void *output, + const struct FPNGEOptions *options); + +inline size_t FPNGEOutputAllocSize(size_t bytes_per_channel, + size_t num_channels, size_t width, + size_t height) { + // likely an overestimate + return 1024 + (2 * bytes_per_channel * width * num_channels + 1) * height; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ouster_osf/src/png_lidarscan_encoder.cpp b/ouster_osf/src/png_lidarscan_encoder.cpp index 7374cf91..2f1309c2 100644 --- a/ouster_osf/src/png_lidarscan_encoder.cpp +++ b/ouster_osf/src/png_lidarscan_encoder.cpp @@ -9,11 +9,26 @@ #include "ouster/osf/png_lidarscan_encoder.h" +#define QOI_IMPLEMENTATION +#include "qoi.h" + #include #include "ouster/impl/logging.h" #include "png_tools.h" +#include + +#include + +#define QOIR_IMPLEMENTATION +//#include "qoir.h" + +#include "fpnge.h" +#include "fpnge.cc" + +#include "zpng.h" + /* Check for the older version of libpng and add missing macros as necessary */ #if (PNG_LIBPNG_VER_MAJOR == 1) @@ -43,6 +58,157 @@ bool PngLidarScanEncoder::fieldEncode(const LidarScan& lidar_scan, "ERROR: scan_data size is not sufficient to hold idx: " + std::to_string(scan_idx)); } + + /*FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto field = lidar_scan.field(field_type.name); + auto data = field.get(); + + int channels = 1; + int bytewidth = 1; + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + break; + case sensor::ChanFieldType::UINT16: + bytewidth = 2; + break; + case sensor::ChanFieldType::UINT32: + channels = 4; + break; + case sensor::ChanFieldType::UINT64: + break; + } + int 
stride = lidar_scan.w * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, lidar_scan.w, lidar_scan.h); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, lidar_scan.w, stride, lidar_scan.h, output, &options); + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), output, len); + free(output); + return false;*/ + + ZPNG_ImageData opts; + opts.HeightPixels = lidar_scan.h; + opts.WidthPixels = lidar_scan.w; + + auto field = lidar_scan.field(field_type.name); + auto data = field.get(); + + opts.Buffer.Data = (unsigned char*)data; + opts.Buffer.Bytes = field.bytes(); + + int channels = 1; + int bytewidth = 1; + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + break; + case sensor::ChanFieldType::UINT16: + bytewidth = 2; + break; + case sensor::ChanFieldType::UINT32: + channels = 4; + break; + case sensor::ChanFieldType::UINT64: + break; + } + + opts.BytesPerChannel = bytewidth; + opts.Channels = channels; + int stride = lidar_scan.w * channels * bytewidth; + opts.StrideBytes = stride; + + auto out = ZPNG_Compress(&opts); + + auto len = out.Bytes; + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), out.Data, len); + ZPNG_Free(&out); + return false; + + /*qoi_desc desc; + desc.width = lidar_scan.w/4; + desc.height = lidar_scan.h; + desc.channels = 4; + desc.colorspace = 0; + + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + desc.width *= 4; + desc.channels = 1; + break; + case sensor::ChanFieldType::UINT16: + desc.height *= 2; + break; + case sensor::ChanFieldType::UINT32: + desc.height *= 4; + break; + case sensor::ChanFieldType::UINT64: + desc.height *= 8; + break; + } + + int qlen; + void* result = qoi_encode(lidar_scan.field(field_type.name).get(), &desc, &qlen); + + auto field = lidar_scan.field(field_type.name); + auto bytes = qlen; + auto len = LZ4_compressBound(bytes); + scan_data[scan_idx].resize(len); + auto 
real_len = LZ4_compress_default((char*)result, (char*)scan_data[scan_idx].data(), bytes, len); + scan_data[scan_idx].resize(real_len); + free(result);*/ + + /*qoi_desc desc; + desc.width = lidar_scan.w/4; + desc.height = lidar_scan.h; + desc.channels = 4; + desc.colorspace = 0; + + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + desc.width *= 4; + desc.channels = 1; + break; + case sensor::ChanFieldType::UINT16: + desc.height *= 2; + break; + case sensor::ChanFieldType::UINT32: + desc.height *= 4; + break; + case sensor::ChanFieldType::UINT64: + desc.height *= 8; + break; + } + + int len; + void* result = qoi_encode(lidar_scan.field(field_type.name).get(), &desc, &len); + bool res = true; + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), result, len); + free(result);*/ + + /*auto field = lidar_scan.field(field_type.name); + auto bytes = field.bytes(); + auto len = LZ4_compressBound(bytes); + scan_data[scan_idx].resize(len); + auto real_len = LZ4_compress_default((char*)field.get(), (char*)scan_data[scan_idx].data(), bytes, len); + scan_data[scan_idx].resize(real_len);*/ + + /*auto field = lidar_scan.field(field_type.name); + auto bytes = field.bytes(); + auto len = compressBound(bytes); + scan_data[scan_idx].resize(len); + auto real_len = len; + compress((unsigned char*)scan_data[scan_idx].data(), &real_len, (unsigned char*)field.get(), bytes); + scan_data[scan_idx].resize(real_len);*/ + //printf("%li vs %li %f\n", bytes, real_len, (float)real_len/(float)bytes); + bool res = true; switch (field_type.element_type) { case sensor::ChanFieldType::UINT8: @@ -76,7 +242,7 @@ bool PngLidarScanEncoder::fieldEncode(const LidarScan& lidar_scan, logger().error("ERROR: fieldEncode: Can't encode field {}", field_type.name); } - return res; + return false; } ScanChannelData PngLidarScanEncoder::encodeField( @@ -141,6 +307,28 @@ bool PngLidarScanEncoder::encode8bitImage( ScanChannelData& res_buf, const Eigen::Ref>& img) const { const 
uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto data = img.data(); + + int channels = 1; + int bytewidth = 1; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, width, stride, height, output, &options); + res_buf.resize(len); + memcpy(res_buf.data(), output, len); + free(output); + + return false; // 8 bit Gray const int sample_depth = 8; @@ -188,6 +376,36 @@ bool PngLidarScanEncoder::encode16bitImage( const uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + std::vector data2(height*width*2); + for (size_t u = 0; u < height; ++u) { + for (size_t v = 0; v < width; ++v) { + const uint64_t key_val = img(u, v); + + // 16bit Encoding Logic + data2[u*width*2 + v * 2 + 1] = static_cast(key_val & 0xff); + data2[u*width*2 + v * 2 + 0] = static_cast((key_val >> 8u) & 0xff); + } + } + + int channels = 1; + int bytewidth = 2; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + res_buf.resize(buf_size); + auto output = res_buf.data(); + auto len = FPNGEEncode(bytewidth, channels, data2.data(), width, stride, height, output, &options); + res_buf.resize(len); + + return false; + // 16 bit Gray const int sample_depth = 16; const int color_type = PNG_COLOR_TYPE_GRAY; @@ -312,6 +530,28 @@ bool 
PngLidarScanEncoder::encode32bitImage( const uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto data = img.data(); + + int channels = 4; + int bytewidth = 1; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, width, stride, height, output, &options); + res_buf.resize(len); + memcpy(res_buf.data(), output, len); + free(output); + + return false; + // 8bit RGBA const int sample_depth = 8; const int color_type = PNG_COLOR_TYPE_RGB_ALPHA; diff --git a/ouster_osf/src/png_tools.cpp b/ouster_osf/src/png_tools.cpp index 4999394d..07dbe420 100644 --- a/ouster_osf/src/png_tools.cpp +++ b/ouster_osf/src/png_tools.cpp @@ -11,9 +11,13 @@ #include #include +#include "qoi.h" + #include "ouster/impl/logging.h" #include "ouster/lidar_scan.h" +#include "zpng.h" + using namespace ouster::sensor; namespace ouster { @@ -179,6 +183,19 @@ void png_osf_write_start(png_structp png_ptr, png_infop png_info_ptr, bool fieldDecode(LidarScan& lidar_scan, const ScanData& scan_data, size_t start_idx, const ouster::FieldType& field_type, const std::vector& px_offset) { + ZPNG_Buffer buffer; + buffer.Bytes = scan_data[start_idx].size(); + buffer.Data = (unsigned char*)scan_data[start_idx].data(); + auto out = ZPNG_Decompress(buffer); + + if (out.Buffer.Data) { + // we can avoid a copy here if we want to microoptimize by changing zpng decompress + auto& field = lidar_scan.field(field_type.name); + memcpy(field.get(), out.Buffer.Data, out.Buffer.Bytes); + ZPNG_Free(&out.Buffer); + return false; + } + switch (field_type.element_type) { case sensor::ChanFieldType::UINT8: 
return decode8bitImage(lidar_scan.field(field_type.name), diff --git a/ouster_osf/src/qoi.h b/ouster_osf/src/qoi.h new file mode 100644 index 00000000..bdf7a0f7 --- /dev/null +++ b/ouster_osf/src/qoi.h @@ -0,0 +1,1079 @@ +/* + +QOI - The "Quite OK Image" format for fast, lossless image compression + +Dominic Szablewski - https://phoboslab.org + +Greyscale Additions - Jay Stavnitzky + +-- LICENSE: The MIT License(MIT) + +Copyright(c) 2021 Dominic Szablewski + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files(the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions : +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +-- About + +QOI encodes and decodes images in a lossless format. Compared to stb_image and +stb_image_write QOI offers 20x-50x faster encoding, 3x-4x faster decoding and +20% better compression. + + +-- Synopsis + +// Define `QOI_IMPLEMENTATION` in *one* C/C++ file before including this +// library to create the implementation. + +#define QOI_IMPLEMENTATION +#include "qoi.h" + +// Encode and store an RGBA buffer to the file system. The qoi_desc describes +// the input pixel data. 
+qoi_write("image_new.qoi", rgba_pixels, &(qoi_desc){ + .width = 1920, + .height = 1080, + .channels = 4, + .colorspace = QOI_SRGB +}); + +// Load and decode a QOI image from the file system into a 32bbp RGBA buffer. +// The qoi_desc struct will be filled with the width, height, number of channels +// and colorspace read from the file header. +qoi_desc desc; +void *rgba_pixels = qoi_read("image.qoi", &desc, 4); + + + +-- Documentation + +This library provides the following functions; +- qoi_read -- read and decode a QOI file +- qoi_decode -- decode the raw bytes of a QOI image from memory +- qoi_write -- encode and write a QOI file +- qoi_encode -- encode an rgba buffer into a QOI image in memory + +See the function declaration below for the signature and more information. + +If you don't want/need the qoi_read and qoi_write functions, you can define +QOI_NO_STDIO before including this library. + +This library uses malloc() and free(). To supply your own malloc implementation +you can define QOI_MALLOC and QOI_FREE before including this library. + +This library uses memset() to zero-initialize the index. To supply your own +implementation you can define QOI_ZEROARR before including this library. + + +-- Data Format + +A QOI file has a 14 byte header, followed by any number of data "chunks" and an +8-byte end marker. + +struct qoi_header_t { + char magic[4]; // magic bytes "qoif" + uint32_t width; // image width in pixels (BE) + uint32_t height; // image height in pixels (BE) + uint8_t channels; // 3 = RGB, 4 = RGBA, 1=GREY + uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear +}; + +Images are encoded row by row, left to right, top to bottom. The decoder and +encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous pixel value. An +image is complete when all pixels specified by width * height have been covered. 
+
+COLOUR PIXEL FORMATS 3 or 4 bytes per pixel format
+
+Colour pixels are encoded as
+ - a run of the previous pixel
+ - an index into an array of previously seen pixels
+ - a difference to the previous pixel value in r,g,b
+ - full r,g,b or r,g,b,a values
+
+The color channels are assumed to not be premultiplied with the alpha channel
+("un-premultiplied alpha").
+
+A running array[64] (zero-initialized) of previously seen pixel values is
+maintained by the encoder and decoder. Each pixel that is seen by the encoder
+and decoder is put into this array at the position formed by a hash function of
+the color value. In the encoder, if the pixel value at the index matches the
+current pixel, this index position is written to the stream as QOI_OP_INDEX.
+The hash function for the index is:
+
+ index_position = (r * 3 + g * 5 + b * 7 + a * 11) % 64
+
+Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The 8-bit tags have precedence over the 2-bit tags. A decoder must check for the
+presence of an 8-bit tag first.
+
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.
+
+
+The possible chunks are:
+
+.- QOI_OP_INDEX ----------.
+| Byte[0] |
+| 7 6 5 4 3 2 1 0 |
+|-------+-----------------|
+| 0 0 | index |
+`-------------------------`
+2-bit tag b00
+6-bit index into the color index array: 0..63
+
+A valid encoder must not issue 2 or more consecutive QOI_OP_INDEX chunks to the
+same index. QOI_OP_RUN should be used instead.
+
+
+.- QOI_OP_DIFF -----------.
+| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----+-----+-----| +| 0 1 | dr | dg | db | +`-------------------------` +2-bit tag b01 +2-bit red channel difference from the previous pixel between -2..1 +2-bit green channel difference from the previous pixel between -2..1 +2-bit blue channel difference from the previous pixel between -2..1 + +Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as +0 (b00). 1 is stored as 3 (b11). + +The difference to the current channel values are using a wraparound operation, +so "1 - 2" will result in 255, while "255 + 1" will result in 0. + +The alpha value remains unchanged from the previous pixel. + +.- QOI_OP_LUMA -------------------------------------. +| Byte[0] | Byte[1] | +| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | +|-------+-----------------+-------------+-----------| +| 1 0 | green diff | dr - dg | db - dg | +`---------------------------------------------------` +2-bit tag b10 +6-bit green channel difference from the previous pixel -32..31 +4-bit red channel difference minus green channel difference -8..7 +4-bit blue channel difference minus green channel difference -8..7 + +The green channel is used to indicate the general direction of change and is +encoded in 6 bits. The red and blue channels (dr and db) base their diffs off +of the green channel difference and are encoded in 4 bits. I.e.: + dr_dg = (cur_px.r - prev_px.r) - (cur_px.g - prev_px.g) + db_dg = (cur_px.b - prev_px.b) - (cur_px.g - prev_px.g) + +The difference to the current channel values are using a wraparound operation, +so "10 - 13" will result in 253, while "250 + 7" will result in 1. + +Values are stored as unsigned integers with a bias of 32 for the green channel +and a bias of 8 for the red and blue channel. + +The alpha value remains unchanged from the previous pixel. + +.- QOI_OP_RUN ------------. 
+| Byte[0] |
+| 7 6 5 4 3 2 1 0 |
+|-------+-----------------|
+| 1 1 | run |
+`-------------------------`
+2-bit tag b11
+6-bit run-length repeating the previous pixel: 1..62
+
+Note that the run-lengths 63 and 64
+(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and
+QOI_OP_RGBA tags.
+
+The run-length is stored with a bias of -1.
+
+.- QOI_OP_RGB ------------------------------------------.
+| Byte[0] | Byte[1] | Byte[2] | Byte[3] |
+| 7 6 5 4 3 2 1 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 |
+|-------------------------+---------+---------+---------|
+| 1 1 1 1 1 1 1 0 | red | green | blue |
+`-------------------------------------------------------`
+8-bit tag b11111110
+8-bit red channel value
+8-bit green channel value
+8-bit blue channel value
+
+The alpha value remains unchanged from the previous pixel.
+
+.- QOI_OP_RGBA ---------------------------------------------------.
+| Byte[0] | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
+| 7 6 5 4 3 2 1 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 |
+|-------------------------+---------+---------+---------+---------|
+| 1 1 1 1 1 1 1 1 | red | green | blue | alpha |
+`-----------------------------------------------------------------`
+8-bit tag b11111111
+8-bit red channel value
+8-bit green channel value
+8-bit blue channel value
+8-bit alpha channel value
+
+
+GREYSCALE PIXEL FORMAT 1 byte per pixel format
+
+Grey pixels are encoded as
+ - a run of the previous pixel value
+ - differences to the previous pixel value (two 4-bit differences per byte)
+ - full intensity values
+
+Each chunk starts with a 2-bit tag, followed by a number of bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.
+ + +The possible chunks are: + +.- QOI_OP1_2DIFF ---------.-------------------------- +| Byte[0] | Byte[1] | +| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | +|-------+-----------------|------------+------------| +| 0 1 | diff byte count | diff0 | diff1 | ...... +`-------------------------`-------------------------` +2-bit tag b01 +6-bit difference byte count for the next N bytes (so 2 x N pixels): 1..64 +two 4-bit greyscale differences from the previous pixel, they are found in pairs and 2 fill bytes. There is always 2. + +Values are stored as unsigned integers with a bias of 8. E.g. -8 is stored as +0 (b00). + +The difference to the current channel values are using a wraparound operation, +so "1 - 2" will result in 255, while "255 + 1" will result in 0. + + +.- QOI_OP1_RAW ------------.------- +| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----------------|------- +| 1 0 | raw bytes | pixel value 1,2,3,etc +`-------------------------`------- +2-bit tag b10 +6-bit raw byte data for the next N pixels: 1..64 +8-bit raw data bytes of pixel intensity values + + +The raw byte length is stored with a bias of -1. +.- QOI_OP1_RUN ------------. +| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----------------| +| 1 1 | run | +`-------------------------` +2-bit tag b11 +6-bit run-length repeating the previous pixel value: 1..64 pixels + +The run-length is stored with a bias of -1. + +*/ + + +/* ----------------------------------------------------------------------------- +Header - Public functions */ + +#ifndef QOI_H +#define QOI_H + +#pragma warning(disable : 4996) + +#ifdef __cplusplus +extern "C" { +#endif + +/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions. +It describes either the input format (for qoi_write and qoi_encode), or is +filled with the description read from the file header (for qoi_read and +qoi_decode). + +The colorspace in this qoi_desc is an enum where + 0 = sRGB, i.e. 
gamma scaled RGB channels and a linear alpha channel + 1 = all channels are linear +You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely +informative. It will be saved to the file header, but does not affect +how chunks are en-/decoded. */ + +#define QOI_SRGB 0 +#define QOI_LINEAR 1 + +typedef struct { + unsigned int width; + unsigned int height; + unsigned char channels; + unsigned char colorspace; +} qoi_desc; + +#ifndef QOI_NO_STDIO + +/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file +system. The qoi_desc struct must be filled with the image width, height, +number of channels (3 = RGB, 4 = RGBA) and the colorspace. + +The function returns 0 on failure (invalid parameters, or fopen or malloc +failed) or the number of bytes written on success. */ + +int qoi_write(const char *filename, const void *data, const qoi_desc *desc); + + +/* Read and decode a QOI image from the file system. If channels is 0, the +number of channels from the file header is used. If channels is 3 or 4 the +output format will be forced into this number of channels. + +The function either returns NULL on failure (invalid data, or malloc or fopen +failed) or a pointer to the decoded pixels. On success, the qoi_desc struct +will be filled with the description from the file header. + +The returned pixel data should be free()d after use. */ + +void *qoi_read(const char *filename, qoi_desc *desc, int channels); + +#endif /* QOI_NO_STDIO */ + + +/* Encode raw RGB or RGBA pixels into a QOI image in memory. + +The function either returns NULL on failure (invalid parameters or malloc +failed) or a pointer to the encoded data on success. On success the out_len +is set to the size in bytes of the encoded data. + +The returned qoi data should be free()d after use. */ + +void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len); + + +/* Decode a QOI image from memory. 
+ +The function either returns NULL on failure (invalid parameters or malloc +failed) or a pointer to the decoded pixels. On success, the qoi_desc struct +is filled with the description from the file header. + +The returned pixel data should be free()d after use. */ + +void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels); + + +#ifdef __cplusplus +} +#endif +#endif /* QOI_H */ + + +/* ----------------------------------------------------------------------------- +Implementation */ + +#ifdef QOI_IMPLEMENTATION +#include +#include + +#ifndef QOI_MALLOC + #define QOI_MALLOC(sz) malloc(sz) + #define QOI_FREE(p) free(p) +#endif +#ifndef QOI_ZEROARR + #define QOI_ZEROARR(a) memset((a),0,sizeof(a)) +#endif + +#define QOI_OP_INDEX 0x00 /* 00xxxxxx */ +#define QOI_OP_DIFF 0x40 /* 01xxxxxx */ +#define QOI_OP_LUMA 0x80 /* 10xxxxxx */ +#define QOI_OP_RUN 0xc0 /* 11xxxxxx */ +#define QOI_OP_RGB 0xfe /* 11111110 */ +#define QOI_OP_RGBA 0xff /* 11111111 */ + +#define QOI_MASK_2 0xc0 /* 11000000 */ +#define QOI_MASK_3 0xe0 /* 11100000 */ + +#define QOI_COLOR_HASH(C) (C.rgba.r*3 + C.rgba.g*5 + C.rgba.b*7 + C.rgba.a*11) + +#define QOI_MAGIC \ + (((unsigned int)'q') << 24 | ((unsigned int)'o') << 16 | \ + ((unsigned int)'i') << 8 | ((unsigned int)'f')) +#define QOI_HEADER_SIZE 14 + +/* 2GB is the max file size that this implementation can safely handle. We guard +against anything larger than that, assuming the worst case with 5 bytes per +pixel, rounded down to a nice clean value. 400 million pixels ought to be +enough for anybody. 
*/ +#define QOI_PIXELS_MAX ((unsigned int)400000000) + +typedef union { + struct { unsigned char r, g, b, a; } rgba; + unsigned int v; +} qoi_rgba_t; + +static const unsigned char qoi_padding[8] = {0,0,0,0,0,0,0,1}; + +static void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) { + bytes[(*p)++] = (0xff000000 & v) >> 24; + bytes[(*p)++] = (0x00ff0000 & v) >> 16; + bytes[(*p)++] = (0x0000ff00 & v) >> 8; + bytes[(*p)++] = (0x000000ff & v); +} + +static unsigned int qoi_read_32(const unsigned char *bytes, int *p) { + unsigned int a = bytes[(*p)++]; + unsigned int b = bytes[(*p)++]; + unsigned int c = bytes[(*p)++]; + unsigned int d = bytes[(*p)++]; + return a << 24 | b << 16 | c << 8 | d; +} + +//turn features on and off for testing, raw is always necessary +#define ENABLE_RUN +#define ENABLE_2DIFF +//define ENABLE_COUNTS //turn on keeping/printing of statistics + +#define QOI_OP1_2DIFF 0x40 /* 01xxxxxx 4bit difference*/ +#define QOI_OP1_RAW 0x80 /* 10xxxxxx raw bytes */ +#define QOI_OP1_RUN 0xc0 /* 11xxxxxx run length*/ + +void *qoi_grey_encode(const void *data, const qoi_desc *desc, int *out_len) { + + int i, max_size, p; + + unsigned char run; + + int px_len, px_end, px_pos, channels, px_stop; + + unsigned char *bytes; + + unsigned char mode, mode_next; + const unsigned char *pixels; + const unsigned char *thispixel; + + unsigned char px, px_prev; + unsigned char px1; + + signed char dv0, dv1; //deltas between pixels + + unsigned char *run_start=NULL; + +#ifdef ENABLE_COUNTS + unsigned long count_run = 0; + unsigned long count_raw = 0; + unsigned long count_2diff = 0; +#endif + + if ( + data == NULL || out_len == NULL || desc == NULL || + desc->width == 0 || desc->height == 0 || + desc->channels != 1 || + desc->colorspace > 1 || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + max_size = + desc->width * desc->height * (desc->channels + 1) + + QOI_HEADER_SIZE + sizeof(qoi_padding); + + p = 0; + bytes = (unsigned char *) 
QOI_MALLOC(max_size); + if (!bytes) { + return NULL; + } + + qoi_write_32(bytes, &p, QOI_MAGIC); + qoi_write_32(bytes, &p, desc->width); + qoi_write_32(bytes, &p, desc->height); + bytes[p++] = desc->channels; + bytes[p++] = desc->colorspace; + + + pixels = (const unsigned char *)data; + + run = 0; + mode = 0; + px_prev = 0; + px = px_prev; + px1 = px; + + px_len = desc->width * desc->height * desc->channels; + px_end = px_len - desc->channels; //move back the to the beginning of the last pixel + px_stop = px_end - 2*(desc->channels); //this is the last pixel we can look at in the main loop + channels = desc->channels; + + //always do the first byte in raw + run_start = &(bytes[p]); + bytes[p++] = QOI_OP1_RAW; //we don't know how many bytes yet, but output the 2 type bits + bytes[p++] = *(pixels); + run = 1; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + mode = QOI_OP1_RAW; + px_prev = *(pixels); + thispixel = pixels + 1; + + for (px_pos = 1; px_pos < px_stop; px_pos++) { + + //thispixel = pixels + px_pos; + + px = *(thispixel++); + px1 = *(thispixel); + +#ifdef ENABLE_RUN + if (px == px_prev) + { //same value - run length + mode_next = QOI_OP1_RUN; +#ifdef ENABLE_COUNTS + count_run++; +#endif + + } + else +#endif //ENABLE_RUN + { + dv0 = px - px_prev; //deltas between pixels + dv1 = px1 - px; + +#ifdef ENABLE_2DIFF + if (dv0 > -9 && dv0 < 8 && + dv1 > -9 && dv1 < 8 + ) + { + mode_next = QOI_OP1_2DIFF; +#ifdef ENABLE_COUNTS + count_2diff += 2; +#endif + } + else +#endif //ENABLE_2DIFF + { //default is RAW bytes + mode_next = QOI_OP1_RAW; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + } + } + + if (mode_next != mode || run == 62 ) //we're switching to a different mode, close one mode and start the next + { + *run_start = (*run_start) | (run - 1); + run = 0; + + run_start = &(bytes[p]); + bytes[p++] = mode_next; //we don't know how many bytes yet, but output the 2 type bits + + mode = mode_next; + } + + run++;// we've always created one byte of output data + +#ifdef 
ENABLE_2DIFF + if (mode == QOI_OP1_2DIFF) + { + bytes[p++] = (dv0 + 8) << 4 | (dv1 + 8); + px_pos++; + thispixel++; + px_prev = px1; + } + else +#endif + if (mode == QOI_OP1_RAW) + { + bytes[p++] = px; + px_prev = px; + } + + } + + //close out the last one + *run_start = (*run_start) | (run - 1); + run=0; + + if (px_pos != px_end) //if we have any left over pixels, store them RAW + { + run_start = &(bytes[p]); + bytes[p++] = QOI_OP1_RAW; //we don't know how many bytes yet, but output the 2 type bits + + while (px_pos <= px_end) + { + bytes[p++] = *(thispixel++); + px_pos++; + run++; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + + } + *run_start = (*run_start) | (run - 1); + run = 0; + } + + for (i = 0; i < (int)sizeof(qoi_padding); i++) { + bytes[p++] = qoi_padding[i]; + } + +#ifdef ENABLE_COUNTS + unsigned long count_total = count_run+ count_raw+ count_2diff; + double dcount_total = count_total; + + printf("encode counts: run %ld, raw %ld, 2diff %ld, total %ld, inlen %ld, outlen %ld\n", + count_run, count_raw, count_2diff, count_total, px_len,p); + printf("encode percent: run %.2f, raw %.2f, 2diff %.2f, out %.3f\n", + count_run/dcount_total, count_raw / dcount_total, count_2diff / dcount_total, p/(double)(px_len)); + +#endif + + *out_len = p; + return bytes; +} + +void *qoi_grey_decode(const void *data, int size, qoi_desc *desc, int channels) { + const unsigned char *bytes; + unsigned int header_magic; + unsigned char *pixels; + + //qoi_rgba_t index[64]; + unsigned char px; + int px_len, chunks_len, px_pos; + int p = 0, run = 0, raw=0; + + if ( + data == NULL || desc == NULL || + (channels != 0 && channels != 1) || + size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding) + ) { + return NULL; + } + + bytes = (const unsigned char *)data; + + header_magic = qoi_read_32(bytes, &p); + desc->width = qoi_read_32(bytes, &p); + desc->height = qoi_read_32(bytes, &p); + desc->channels = bytes[p++]; + desc->colorspace = bytes[p++]; + + if ( + desc->width == 0 || desc->height == 0 
|| + //desc->channels < 3 || desc->channels > 4 || + desc->channels != 1 || + desc->colorspace > 1 || + header_magic != QOI_MAGIC || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + if (channels == 0) { + channels = desc->channels; + } + + px_len = desc->width * desc->height * channels; + pixels = (unsigned char *) QOI_MALLOC(px_len); + if (!pixels) { + return NULL; + } + + px = 0; + + chunks_len = size - (int)sizeof(qoi_padding); + for (px_pos = 0; px_pos < px_len; px_pos++) + { + int b1 = bytes[p++]; + + if ((b1 & QOI_MASK_2) == QOI_OP1_2DIFF) { + run = (b1 & 0x3f); + while (run >= 0) + { + b1 = bytes[p++]; + px += ((b1 >> 4) & 0x0f) - 8; + + //dump a pixel here + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + + px += ((b1) & 0x0f) - 8; + + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + + run--; + } + px_pos--; + + } + else if ((b1 & QOI_MASK_2) == QOI_OP1_RUN) { + run = (b1 & 0x3f); + while (run >= 0) + { + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + run--; + } + px_pos--; + } + else if ((b1 & QOI_MASK_2) == QOI_OP1_RAW) { + raw = (b1 & 0x3f); + while (raw >= 0) + { + px = bytes[p++]; + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + raw--; + } + px_pos--; + } + } + + return pixels; +} + +void* qoi_colour_encode(const void* data, const qoi_desc* desc, int* out_len) { + int i, max_size, p, run; + int px_len, px_end, px_pos, channels; + unsigned char* bytes; + const unsigned char* pixels; + qoi_rgba_t index[64]; + qoi_rgba_t px, px_prev; + + if ( + data == NULL || out_len == NULL || desc == NULL || + desc->width == 0 || desc->height == 0 || + desc->channels < 3 || desc->channels > 4 || + desc->colorspace > 1 || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + max_size = + desc->width * desc->height * (desc->channels + 1) + + QOI_HEADER_SIZE + sizeof(qoi_padding); + + p = 0; + bytes = (unsigned 
char*)QOI_MALLOC(max_size); + if (!bytes) { + return NULL; + } + + qoi_write_32(bytes, &p, QOI_MAGIC); + qoi_write_32(bytes, &p, desc->width); + qoi_write_32(bytes, &p, desc->height); + bytes[p++] = desc->channels; + bytes[p++] = desc->colorspace; + + + pixels = (const unsigned char*)data; + + QOI_ZEROARR(index); + + run = 0; + px_prev.rgba.r = 0; + px_prev.rgba.g = 0; + px_prev.rgba.b = 0; + px_prev.rgba.a = 255; + px = px_prev; + + px_len = desc->width * desc->height * desc->channels; + px_end = px_len - desc->channels; + channels = desc->channels; + + for (px_pos = 0; px_pos < px_len; px_pos += channels) { + if (channels == 4) { + px = *(qoi_rgba_t*)(pixels + px_pos); + } + else { + px.rgba.r = pixels[px_pos + 0]; + px.rgba.g = pixels[px_pos + 1]; + px.rgba.b = pixels[px_pos + 2]; + } + + if (px.v == px_prev.v) { + run++; + if (run == 62 || px_pos == px_end) { + bytes[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + } + else { + int index_pos; + + if (run > 0) { + bytes[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + + index_pos = QOI_COLOR_HASH(px) % 64; + + if (index[index_pos].v == px.v) { + bytes[p++] = QOI_OP_INDEX | index_pos; + } + else { + index[index_pos] = px; + + if (px.rgba.a == px_prev.rgba.a) { + signed char vr = px.rgba.r - px_prev.rgba.r; + signed char vg = px.rgba.g - px_prev.rgba.g; + signed char vb = px.rgba.b - px_prev.rgba.b; + + signed char vg_r = vr - vg; + signed char vg_b = vb - vg; + + if ( + vr > -3 && vr < 2 && + vg > -3 && vg < 2 && + vb > -3 && vb < 2 + ) { + bytes[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2); + } + else if ( + vg_r > -9 && vg_r < 8 && + vg > -33 && vg < 32 && + vg_b > -9 && vg_b < 8 + ) { + bytes[p++] = QOI_OP_LUMA | (vg + 32); + bytes[p++] = (vg_r + 8) << 4 | (vg_b + 8); + } + else { + bytes[p++] = QOI_OP_RGB; + bytes[p++] = px.rgba.r; + bytes[p++] = px.rgba.g; + bytes[p++] = px.rgba.b; + } + } + else { + bytes[p++] = QOI_OP_RGBA; + bytes[p++] = px.rgba.r; + bytes[p++] = px.rgba.g; + bytes[p++] = 
px.rgba.b; + bytes[p++] = px.rgba.a; + } + } + } + px_prev = px; + } + + for (i = 0; i < (int)sizeof(qoi_padding); i++) { + bytes[p++] = qoi_padding[i]; + } + + *out_len = p; + return bytes; +} + +void* qoi_colour_decode(const void* data, int size, qoi_desc* desc, int channels) { + const unsigned char* bytes; + unsigned int header_magic; + unsigned char* pixels; + qoi_rgba_t index[64]; + qoi_rgba_t px; + int px_len, chunks_len, px_pos; + int p = 0, run = 0; + + if ( + data == NULL || desc == NULL || + (channels != 0 && channels != 3 && channels != 4) || + size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding) + ) { + return NULL; + } + + bytes = (const unsigned char*)data; + + header_magic = qoi_read_32(bytes, &p); + desc->width = qoi_read_32(bytes, &p); + desc->height = qoi_read_32(bytes, &p); + desc->channels = bytes[p++]; + desc->colorspace = bytes[p++]; + + if ( + desc->width == 0 || desc->height == 0 || + desc->channels < 3 || desc->channels > 4 || + desc->colorspace > 1 || + header_magic != QOI_MAGIC || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + if (channels == 0) { + channels = desc->channels; + } + + px_len = desc->width * desc->height * channels; + pixels = (unsigned char*)QOI_MALLOC(px_len); + if (!pixels) { + return NULL; + } + + QOI_ZEROARR(index); + px.rgba.r = 0; + px.rgba.g = 0; + px.rgba.b = 0; + px.rgba.a = 255; + + chunks_len = size - (int)sizeof(qoi_padding); + for (px_pos = 0; px_pos < px_len; px_pos += channels) { + if (run > 0) { + run--; + } + else if (p < chunks_len) { + int b1 = bytes[p++]; + + if (b1 == QOI_OP_RGB) { + px.rgba.r = bytes[p++]; + px.rgba.g = bytes[p++]; + px.rgba.b = bytes[p++]; + } + else if (b1 == QOI_OP_RGBA) { + px.rgba.r = bytes[p++]; + px.rgba.g = bytes[p++]; + px.rgba.b = bytes[p++]; + px.rgba.a = bytes[p++]; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) { + px = index[b1]; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) { + px.rgba.r += ((b1 >> 4) & 0x03) - 2; + px.rgba.g += ((b1 >> 
2) & 0x03) - 2; + px.rgba.b += (b1 & 0x03) - 2; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) { + int b2 = bytes[p++]; + int vg = (b1 & 0x3f) - 32; + px.rgba.r += vg - 8 + ((b2 >> 4) & 0x0f); + px.rgba.g += vg; + px.rgba.b += vg - 8 + (b2 & 0x0f); + } + else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { + run = (b1 & 0x3f); + } + + index[QOI_COLOR_HASH(px) % 64] = px; + } + + if (channels == 4) { + *(qoi_rgba_t*)(pixels + px_pos) = px; + } + else { + pixels[px_pos + 0] = px.rgba.r; + pixels[px_pos + 1] = px.rgba.g; + pixels[px_pos + 2] = px.rgba.b; + } + } + + return pixels; +} + + +void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) { + + if(desc->channels == 3 || desc->channels == 4) + { + return qoi_colour_encode(data, desc, out_len); + } + else if (desc->channels == 1) + { + return qoi_grey_encode(data, desc, out_len); + } +} + +void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) + { + + void *retptr=NULL; + + //try it as colour + retptr=qoi_colour_decode(data, size, desc, channels); + + //if not try it as grey + if(retptr==NULL) + { + retptr=qoi_grey_decode(data, size, desc, channels); + } + + return retptr; +} + + +#ifndef QOI_NO_STDIO +#include + +int qoi_write(const char *filename, const void *data, const qoi_desc *desc) { + FILE *f = fopen(filename, "wb"); + int size; + void *encoded; + + if (!f) { + return 0; + } + + encoded = qoi_encode(data, desc, &size); + if (!encoded) { + fclose(f); + return 0; + } + + fwrite(encoded, 1, size, f); + fclose(f); + + QOI_FREE(encoded); + return size; +} + +void *qoi_read(const char *filename, qoi_desc *desc, int channels) { + FILE *f = fopen(filename, "rb"); + int size, bytes_read; + void *pixels, *data; + + if (!f) { + return NULL; + } + + fseek(f, 0, SEEK_END); + size = ftell(f); + if (size <= 0) { + fclose(f); + return NULL; + } + fseek(f, 0, SEEK_SET); + + data = QOI_MALLOC(size); + if (!data) { + fclose(f); + return NULL; + } + + bytes_read = fread(data, 1, size, f); + 
fclose(f); + + pixels = qoi_decode(data, bytes_read, desc, channels); + QOI_FREE(data); + return pixels; +} + +#endif /* QOI_NO_STDIO */ +#endif /* QOI_IMPLEMENTATION */ diff --git a/ouster_osf/src/zpng.cpp b/ouster_osf/src/zpng.cpp new file mode 100644 index 00000000..3f97e120 --- /dev/null +++ b/ouster_osf/src/zpng.cpp @@ -0,0 +1,581 @@ +/** \file + \brief Zpng - Experimental Lossless Image Compressor + \copyright Copyright (c) 2018 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Zpng nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "zpng.h" + +#include + +#include // calloc + + +//------------------------------------------------------------------------------ +// Constants + +// Higher compression levels do not gain much but hurt speed +static const int kCompressionLevel = 1; + +// This enabled some specialized versions for RGB and RGBA +#define ENABLE_RGB_COLOR_FILTER + +// Header definitions +#define ZPNG_HEADER_MAGIC 0xFBF8 +#define ZPNG_HEADER_OVERHEAD_BYTES 8 + +// File format header +struct ZPNG_Header +{ + uint16_t Magic; + uint16_t Width; + uint16_t Height; + uint8_t Channels; + uint8_t BytesPerChannel; +}; + + +//------------------------------------------------------------------------------ +// Image Processing + +// Interleaving is a 1% compression win, and a 0.3% performance win: Not used. +// Splitting the data into blocks of 4 at a time actually reduces compression. + +template +static void PackAndFilter( + const ZPNG_ImageData* imageData, + uint8_t* output +) +{ + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + for (unsigned y = 0; y < height; ++y) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + // For each channel: + for (unsigned i = 0; i < kChannels; ++i) + { + uint8_t a = input[i]; + uint8_t d = a - prev[i]; + output[i] = d; + prev[i] = a; + } + + input += kChannels; + output += kChannels; + } + } +} + +template +static void UnpackAndUnfilter( + const uint8_t* input, + ZPNG_ImageData* imageData +) +{ + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + for (unsigned y = 0; y < height; ++y) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + // For each channel: + for (unsigned i = 0; i < kChannels; ++i) + { + uint8_t d = input[i]; + uint8_t a = d + prev[i]; + output[i] = a; + prev[i] = a; + 
} + + input += kChannels; + output += kChannels; + } + } +} + + +#ifdef ENABLE_RGB_COLOR_FILTER + +template<> +void PackAndFilter<3>( + const ZPNG_ImageData* imageData, + uint8_t* output + ) +{ + static const unsigned kChannels = 3; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + uint8_t* output_y = output; + uint8_t* output_u = output + planeBytes; + uint8_t* output_v = output + planeBytes * 2; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t r = input[0]; + uint8_t g = input[1]; + uint8_t b = input[2]; + + r -= prev[0]; + g -= prev[1]; + b -= prev[2]; + + prev[0] = input[0]; + prev[1] = input[1]; + prev[2] = input[2]; + + // GB-RG filter from BCIF + uint8_t y = b; + uint8_t u = g - b; + uint8_t v = g - r; + + *output_y++ = y; + *output_u++ = u; + *output_v++ = v; + + input += kChannels; + } + } +} + +template<> +void UnpackAndUnfilter<3>( + const uint8_t* input, + ZPNG_ImageData* imageData + ) +{ + static const unsigned kChannels = 3; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + const uint8_t* input_y = input; + const uint8_t* input_u = input + planeBytes; + const uint8_t* input_v = input + planeBytes * 2; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t y = *input_y++; + uint8_t u = *input_u++; + uint8_t v = *input_v++; + + // GB-RG filter from BCIF + const uint8_t B = y; + const uint8_t G = u + B; + uint8_t r = G - v; + uint8_t g = G; + uint8_t b = B; + + r += prev[0]; + g += prev[1]; + b += prev[2]; + + output[0] = r; + output[1] 
= g; + output[2] = b; + + prev[0] = r; + prev[1] = g; + prev[2] = b; + + output += kChannels; + } + } +} + + +// Version for RGBA (with alpha): + +template<> +void PackAndFilter<4>( + const ZPNG_ImageData* imageData, + uint8_t* output + ) +{ + static const unsigned kChannels = 4; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + uint8_t* output_y = output; + uint8_t* output_u = output + planeBytes; + uint8_t* output_v = output + planeBytes * 2; + uint8_t* output_a = output + planeBytes * 3; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t r = input[0]; + uint8_t g = input[1]; + uint8_t b = input[2]; + uint8_t a = input[3]; + + r -= prev[0]; + g -= prev[1]; + b -= prev[2]; + a -= prev[3]; + + prev[0] = input[0]; + prev[1] = input[1]; + prev[2] = input[2]; + prev[3] = input[3]; + + // GB-RG filter from BCIF + uint8_t y = b; + uint8_t u = g - b; + uint8_t v = g - r; + + *output_y++ = y; + *output_u++ = u; + *output_v++ = v; + *output_a++ = a; + + input += kChannels; + } + } +} + +template<> +void UnpackAndUnfilter<4>( + const uint8_t* input, + ZPNG_ImageData* imageData + ) +{ + static const unsigned kChannels = 4; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + const uint8_t* input_y = input; + const uint8_t* input_u = input + planeBytes; + const uint8_t* input_v = input + planeBytes * 2; + const uint8_t* input_a = input + planeBytes * 3; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t y = *input_y++; + uint8_t u = *input_u++; + uint8_t v = 
*input_v++; + uint8_t a = *input_a++; + + // GB-RG filter from BCIF + const uint8_t B = y; + const uint8_t G = u + B; + uint8_t r = G - v; + uint8_t g = G; + uint8_t b = B; + + r += prev[0]; + g += prev[1]; + b += prev[2]; + a += prev[3]; + + output[0] = r; + output[1] = g; + output[2] = b; + output[3] = a; + + prev[0] = r; + prev[1] = g; + prev[2] = b; + prev[3] = a; + + output += kChannels; + } + } +} + +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// API + +ZPNG_Buffer ZPNG_Compress( + const ZPNG_ImageData* imageData +) +{ + uint8_t* packing = nullptr; + uint8_t* output = nullptr; + + ZPNG_Buffer bufferOutput; + bufferOutput.Data = nullptr; + bufferOutput.Bytes = 0; + + const unsigned pixelCount = imageData->WidthPixels * imageData->HeightPixels; + const unsigned pixelBytes = imageData->BytesPerChannel * imageData->Channels; + const unsigned byteCount = pixelBytes * pixelCount; + + // FIXME: One day add support for other formats + if (pixelBytes > 8) { + return bufferOutput; + } + + // Space for packing + packing = (uint8_t*)calloc(1, byteCount); + + if (!packing) { +ReturnResult: + if (bufferOutput.Data != output) { + free(output); + } + free(packing); + return bufferOutput; + } + + const unsigned maxOutputBytes = (unsigned)ZSTD_compressBound(byteCount); + + // Space for output + output = (uint8_t*)calloc(1, ZPNG_HEADER_OVERHEAD_BYTES + maxOutputBytes); + + if (!output) { + goto ReturnResult; + } + + // Pass 1: Pack and filter data. 
+ + switch (pixelBytes) + { + case 1: + PackAndFilter<1>(imageData, packing); + break; + case 2: + PackAndFilter<2>(imageData, packing); + break; + case 3: + PackAndFilter<3>(imageData, packing); + break; + case 4: + PackAndFilter<4>(imageData, packing); + break; + case 5: + PackAndFilter<5>(imageData, packing); + break; + case 6: + PackAndFilter<6>(imageData, packing); + break; + case 7: + PackAndFilter<7>(imageData, packing); + break; + case 8: + PackAndFilter<8>(imageData, packing); + break; + } + + // Pass 2: Compress the packed/filtered data. + + const size_t result = ZSTD_compress( + output + ZPNG_HEADER_OVERHEAD_BYTES, + maxOutputBytes, + packing, + byteCount, + kCompressionLevel); + + if (ZSTD_isError(result)) { + goto ReturnResult; + } + + // Write header + + ZPNG_Header* header = (ZPNG_Header*)output; + header->Magic = ZPNG_HEADER_MAGIC; + header->Width = (uint16_t)imageData->WidthPixels; + header->Height = (uint16_t)imageData->HeightPixels; + header->Channels = (uint8_t)imageData->Channels; + header->BytesPerChannel = (uint8_t)imageData->BytesPerChannel; + + bufferOutput.Data = output; + bufferOutput.Bytes = ZPNG_HEADER_OVERHEAD_BYTES + (unsigned)result; + + goto ReturnResult; +} + +ZPNG_ImageData ZPNG_Decompress( + ZPNG_Buffer buffer +) +{ + uint8_t* packing = nullptr; + uint8_t* output = nullptr; + + ZPNG_ImageData imageData; + imageData.Buffer.Data = nullptr; + imageData.Buffer.Bytes = 0; + imageData.BytesPerChannel = 0; + imageData.Channels = 0; + imageData.HeightPixels = 0; + imageData.StrideBytes = 0; + imageData.WidthPixels = 0; + + if (!buffer.Data || buffer.Bytes < ZPNG_HEADER_OVERHEAD_BYTES) { +ReturnResult: + if (imageData.Buffer.Data != output) { + free(output); + } + free(packing); + return imageData; + } + + const ZPNG_Header* header = (const ZPNG_Header*)buffer.Data; + if (header->Magic != ZPNG_HEADER_MAGIC) { + goto ReturnResult; + } + + imageData.WidthPixels = header->Width; + imageData.HeightPixels = header->Height; + imageData.Channels 
= header->Channels; + imageData.BytesPerChannel = header->BytesPerChannel; + imageData.StrideBytes = imageData.WidthPixels * imageData.Channels; + + const unsigned pixelCount = imageData.WidthPixels * imageData.HeightPixels; + const unsigned pixelBytes = imageData.BytesPerChannel * imageData.Channels; + const unsigned byteCount = pixelBytes * pixelCount; + + // Space for packing + packing = (uint8_t*)calloc(1, byteCount); + + if (!packing) { + goto ReturnResult; + } + + // Stage 1: Decompress back to packing buffer + + const size_t result = ZSTD_decompress( + packing, + byteCount, + buffer.Data + ZPNG_HEADER_OVERHEAD_BYTES, + buffer.Bytes - ZPNG_HEADER_OVERHEAD_BYTES); + + if (ZSTD_isError(result)) { + goto ReturnResult; + } + + // Stage 2: Unpack/Unfilter + + // Space for output + output = (uint8_t*)calloc(1, byteCount); + + if (!output) { + goto ReturnResult; + } + + imageData.Buffer.Data = output; + imageData.Buffer.Bytes = byteCount; + + switch (pixelBytes) + { + case 1: + UnpackAndUnfilter<1>(packing, &imageData); + break; + case 2: + UnpackAndUnfilter<2>(packing, &imageData); + break; + case 3: + UnpackAndUnfilter<3>(packing, &imageData); + break; + case 4: + UnpackAndUnfilter<4>(packing, &imageData); + break; + case 5: + UnpackAndUnfilter<5>(packing, &imageData); + break; + case 6: + UnpackAndUnfilter<6>(packing, &imageData); + break; + case 7: + UnpackAndUnfilter<7>(packing, &imageData); + break; + case 8: + UnpackAndUnfilter<8>(packing, &imageData); + break; + } + + goto ReturnResult; +} + +void ZPNG_Free( + ZPNG_Buffer* buffer +) +{ + if (buffer && buffer->Data) + { + free(buffer->Data); + buffer->Data = nullptr; + buffer->Bytes = 0; + } +} + + +#ifdef __cplusplus +} +#endif diff --git a/ouster_osf/src/zpng.h b/ouster_osf/src/zpng.h new file mode 100644 index 00000000..a0d3a086 --- /dev/null +++ b/ouster_osf/src/zpng.h @@ -0,0 +1,141 @@ +/** \file + \brief Zpng - Experimental Lossless Image Compressor + \copyright Copyright (c) 2018 Christopher A. Taylor. 
All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Zpng nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CAT_ZPNG_H +#define CAT_ZPNG_H + +/** \page Zpng - Experimental Lossless Image Compressor + + It's much faster than PNG and compresses better for photographic images. + This compressor often takes less than 6% of the time of a PNG compressor + and produces a file that is 66% of the size. + It was written in about 600 lines of C code thanks to the Zstd library. + + This library is similar to PNG in that the image is first filtered, + and then submitted to a data compressor. 
+ The filtering step is a bit simpler and faster but somehow more effective + than the one used in PNG. + The data compressor used is Zstd, which makes it significantly faster + than PNG to compress and decompress. + + Filtering: + + (1) Reversible color channel transformation. + (2) Split each color channel into a separate color plane. + (3) Subtract each color value from the one to its left. +*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Buffer returned by library +struct ZPNG_Buffer +{ + // Pointer to data + unsigned char* Data; + + // Size of buffer in bytes + unsigned Bytes; +}; + +// Image data returned by the library +struct ZPNG_ImageData +{ + // Pixel data + ZPNG_Buffer Buffer; + + // Number of bytes for each color channel (1-2) + unsigned BytesPerChannel; + + // Number of channels for each pixel (1-4) + unsigned Channels; + + // Width in pixels of image + unsigned WidthPixels; + + // Height in pixels of image + unsigned HeightPixels; + + // Width of pixel row in bytes + unsigned StrideBytes; +}; + + +//------------------------------------------------------------------------------ +// API + +/** + ZPNG_Compress() + + Compress image into a buffer. + + The returned buffer should be passed to ZPNG_Free(). + + On success returns a valid data pointer. + On failure returns a null pointer. +*/ +ZPNG_Buffer ZPNG_Compress( + const ZPNG_ImageData* imageData +); + +/* + ZPNG_Decompress() + + Decompress image from a buffer. + + The returned ZPNG_Buffer should be passed to ZPNG_Free(). + + On success returns a valid data pointer. + On failure returns a null pointer. +*/ +ZPNG_ImageData ZPNG_Decompress( + ZPNG_Buffer buffer +); + +/* + ZPNG_Free() + + Free buffer when done to avoid leaks. + + This will also set the buffer data pointer to null. 
+*/ +void ZPNG_Free( + ZPNG_Buffer* buffer +); + + +#ifdef __cplusplus +} +#endif + + +#endif // CAT_ZPNG_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3098cbd9..084fd6ec 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -36,6 +36,7 @@ if(NOT SKIP_SDK_FIND) find_package(OusterSDK REQUIRED) endif() +add_compile_options(-march=native) # when building as a top-level project if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) message(STATUS "Ouster SDK client: Using EIGEN_MAX_ALIGN_BYTES = 32")