diff --git a/CMakeLists.txt b/CMakeLists.txt index f236231d..aae312d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,12 +48,13 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) add_compile_options(/W2 /bigobj /Zf) add_compile_definitions(NOMINMAX _USE_MATH_DEFINES WIN32_LEAN_AND_MEAN) else() - add_compile_options(-Wall -Wextra) + add_compile_options(-Wall -Wextra -march=native) endif() include(CTest) else() set(OUSTER_TOP_LEVEL OFF) + add_compile_options(-Wall -Wextra -march=native) endif() # === Subdirectories === diff --git a/ouster_osf/CMakeLists.txt b/ouster_osf/CMakeLists.txt index 9366ef30..0078845c 100644 --- a/ouster_osf/CMakeLists.txt +++ b/ouster_osf/CMakeLists.txt @@ -93,6 +93,7 @@ add_library(ouster_osf STATIC src/compat_ops.cpp src/fb_utils.cpp src/writer.cpp src/async_writer.cpp + src/zpng.cpp src/png_lidarscan_encoder.cpp ) set_property(TARGET ouster_osf PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/ouster_osf/src/fpnge.cc b/ouster_osf/src/fpnge.cc new file mode 100644 index 00000000..c8e3be0b --- /dev/null +++ b/ouster_osf/src/fpnge.cc @@ -0,0 +1,1714 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "fpnge.h" +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && !defined(__clang__) +#define FORCE_INLINE_LAMBDA +#define FORCE_INLINE __forceinline +#define __SSE4_1__ 1 +#define __PCLMUL__ 1 +#ifdef __AVX2__ +#define __BMI2__ 1 +#endif +#else +#define FORCE_INLINE_LAMBDA __attribute__((always_inline)) +#define FORCE_INLINE __attribute__((always_inline)) inline +#endif + +#if defined(__x86_64__) || defined(__amd64__) || defined(__LP64) || \ + defined(_M_X64) || defined(_M_AMD64) || \ + (defined(_WIN64) && !defined(_M_ARM64)) +#define PLATFORM_AMD64 1 +#endif + +#if !defined(FPNGE_USE_PEXT) +#if defined(__BMI2__) && defined(PLATFORM_AMD64) && \ + !defined(__tune_znver1__) && !defined(__tune_znver2__) && \ + !defined(__tune_bdver4__) +#define FPNGE_USE_PEXT 1 +#else +#define FPNGE_USE_PEXT 0 +#endif +#endif + +#ifdef __AVX2__ +#include +#define MM(f) _mm256_##f +#define MMSI(f) _mm256_##f##_si256 +#define MIVEC __m256i +#define BCAST128 _mm256_broadcastsi128_si256 +// workaround for compilers not supporting _mm256_zextsi128_si256 +#if (defined(__clang__) && __clang_major__ >= 5 && \ + (!defined(__APPLE__) || __clang_major__ >= 7)) || \ + (defined(__GNUC__) && __GNUC__ >= 10) || \ + (defined(_MSC_VER) && _MSC_VER >= 1910) +#define INT2VEC(v) _mm256_zextsi128_si256(_mm_cvtsi32_si128(v)) +#elif defined(__OPTIMIZE__) +// technically incorrect, but should work fine most of the time +#define INT2VEC(v) _mm256_castsi128_si256(_mm_cvtsi32_si128(v)) +#else +// _mm256_insert_epi32 is unavailable on MSVC 19.0, so prefer the following +#define INT2VEC(v) \ + _mm256_inserti128_si256(_mm256_setzero_si256(), _mm_cvtsi32_si128(v), 0); +#endif +#define SIMD_WIDTH 32 +#define SIMD_MASK 0xffffffffU +#elif defined(__SSE4_1__) +#include +#define MM(f) _mm_##f +#define MMSI(f) _mm_##f##_si128 +#define MIVEC __m128i +#define BCAST128(v) (v) +#define INT2VEC _mm_cvtsi32_si128 +#define SIMD_WIDTH 16 +#define SIMD_MASK 0xffffU +#else 
+#error Requires SSE4.1 support minium +#endif + +namespace { + +alignas(16) constexpr uint8_t kBitReverseNibbleLookup[16] = { + 0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110, + 0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111, +}; + +static constexpr uint8_t kLZ77NBits[29] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, + 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 0}; + +static constexpr uint16_t kLZ77Base[29] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, + 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258}; + +static uint16_t BitReverse(size_t nbits, uint16_t bits) { + uint16_t rev16 = (kBitReverseNibbleLookup[bits & 0xF] << 12) | + (kBitReverseNibbleLookup[(bits >> 4) & 0xF] << 8) | + (kBitReverseNibbleLookup[(bits >> 8) & 0xF] << 4) | + (kBitReverseNibbleLookup[bits >> 12]); + return rev16 >> (16 - nbits); +} + +struct HuffmanTable { + uint8_t nbits[286]; + uint16_t end_bits; + + alignas(16) uint8_t approx_nbits[16]; + + alignas(16) uint8_t first16_nbits[16]; + alignas(16) uint8_t first16_bits[16]; + + alignas(16) uint8_t last16_nbits[16]; + alignas(16) uint8_t last16_bits[16]; + + alignas(16) uint8_t mid_lowbits[16]; + uint8_t mid_nbits; + + uint32_t lz77_length_nbits[259] = {}; + uint32_t lz77_length_bits[259] = {}; + uint32_t lz77_length_sym[259] = {}; + + uint32_t dist_nbits, dist_bits; + + // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <= + // max_limit[i], so to minimize sum(nbits[i] * freqs[i]). 
+ static void ComputeCodeLengths(const uint64_t *freqs, size_t n, + uint8_t *min_limit, uint8_t *max_limit, + uint8_t *nbits) { + size_t precision = 0; + uint64_t freqsum = 0; + for (size_t i = 0; i < n; i++) { + assert(freqs[i] != 0); + freqsum += freqs[i]; + if (min_limit[i] < 1) + min_limit[i] = 1; + assert(min_limit[i] <= max_limit[i]); + precision = std::max(max_limit[i], precision); + } + uint64_t infty = freqsum * precision; + std::vector dynp(((1U << precision) + 1) * (n + 1), infty); + auto d = [&](size_t sym, size_t off) -> uint64_t & { + return dynp[sym * ((1 << precision) + 1) + off]; + }; + d(0, 0) = 0; + for (size_t sym = 0; sym < n; sym++) { + for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + for (size_t off = 0; off + off_delta <= (1U << precision); off++) { + d(sym + 1, off + off_delta) = std::min( + d(sym, off) + freqs[sym] * bits, d(sym + 1, off + off_delta)); + } + } + } + + size_t sym = n; + size_t off = 1U << precision; + + while (sym-- > 0) { + assert(off > 0); + for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) { + size_t off_delta = 1U << (precision - bits); + if (off_delta <= off && + d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) { + off -= off_delta; + nbits[sym] = bits; + break; + } + } + } + } + + void ComputeNBits(const uint64_t *collected_data) { + constexpr uint64_t kBaselineData[286] = { + 113, 54, 28, 18, 12, 9, 7, 6, 5, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 5, 6, 7, 9, + 12, 18, 29, 54, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + }; + + uint64_t data[286]; + + for (size_t i = 0; i < 286; i++) { + data[i] = collected_data[i] + kBaselineData[i]; + } + + // Compute Huffman code length ensuring that all the "fake" symbols for [16, + // 240) and [255, 285) have their maximum length. + uint64_t collapsed_data[16 + 14 + 16 + 2] = {}; + uint8_t collapsed_min_limit[16 + 14 + 16 + 2] = {}; + uint8_t collapsed_max_limit[16 + 14 + 16 + 2]; + for (size_t i = 0; i < 48; i++) { + collapsed_max_limit[i] = 8; + } + for (size_t i = 0; i < 16; i++) { + collapsed_data[i] = data[i]; + } + for (size_t i = 0; i < 14; i++) { + collapsed_data[16 + i] = 1; + collapsed_min_limit[16 + i] = 8; + } + for (size_t j = 0; j < 16; j++) { + collapsed_data[16 + 14 + j] += data[240 + j]; + } + collapsed_data[16 + 14 + 16] = 1; + collapsed_min_limit[16 + 14 + 16] = 8; + collapsed_data[16 + 14 + 16 + 1] = data[285]; + + uint8_t collapsed_nbits[48] = {}; + ComputeCodeLengths(collapsed_data, 48, collapsed_min_limit, + collapsed_max_limit, collapsed_nbits); + + // Compute "extra" code lengths for symbols >= 256, except 285. 
+ uint8_t tail_nbits[29] = {}; + uint8_t tail_min_limit[29] = {}; + uint8_t tail_max_limit[29] = {}; + for (size_t i = 0; i < 29; i++) { + tail_min_limit[i] = 4; + tail_max_limit[i] = 7; + } + ComputeCodeLengths(data + 256, 29, tail_min_limit, tail_max_limit, + tail_nbits); + + for (size_t i = 0; i < 16; i++) { + nbits[i] = collapsed_nbits[i]; + } + for (size_t i = 0; i < 14; i++) { + for (size_t j = 0; j < 16; j++) { + nbits[(i + 1) * 16 + j] = collapsed_nbits[16 + i] + 4; + } + } + for (size_t i = 0; i < 16; i++) { + nbits[240 + i] = collapsed_nbits[30 + i]; + } + for (size_t i = 0; i < 29; i++) { + nbits[256 + i] = collapsed_nbits[46] + tail_nbits[i]; + } + nbits[285] = collapsed_nbits[47]; + } + + void ComputeCanonicalCode(const uint8_t *nbits, uint16_t *bits) { + uint8_t code_length_counts[16] = {}; + for (size_t i = 0; i < 286; i++) { + code_length_counts[nbits[i]]++; + } + uint16_t next_code[16] = {}; + uint16_t code = 0; + for (size_t i = 1; i < 16; i++) { + code = (code + code_length_counts[i - 1]) << 1; + next_code[i] = code; + } + for (size_t i = 0; i < 286; i++) { + bits[i] = BitReverse(nbits[i], next_code[nbits[i]]++); + } + } + + void FillNBits() { + for (size_t i = 0; i < 16; i++) { + first16_nbits[i] = nbits[i]; + last16_nbits[i] = nbits[240 + i]; + } + mid_nbits = nbits[16]; + for (size_t i = 16; i < 240; i++) { + assert(nbits[i] == mid_nbits); + } + // Construct lz77 lookup tables. 
+ for (size_t i = 0; i < 29; i++) { + for (size_t j = 0; j < (1U << kLZ77NBits[i]); j++) { + lz77_length_nbits[kLZ77Base[i] + j] = nbits[257 + i] + kLZ77NBits[i]; + lz77_length_sym[kLZ77Base[i] + j] = 257 + i; + } + } + + dist_nbits = 1; + + approx_nbits[0] = + nbits[0] - 1; // subtract 1 as a fudge for catering for RLE + for (size_t i = 1; i < 15; i++) { + approx_nbits[i] = (nbits[i] + nbits[256 - i] + 1) / 2; + } + approx_nbits[15] = mid_nbits; + } + + void FillBits() { + uint16_t bits[286]; + ComputeCanonicalCode(nbits, bits); + for (size_t i = 0; i < 16; i++) { + first16_bits[i] = bits[i]; + last16_bits[i] = bits[240 + i]; + } + mid_lowbits[0] = mid_lowbits[15] = 0; + for (size_t i = 16; i < 240; i += 16) { + mid_lowbits[i / 16] = bits[i] & ((1U << (mid_nbits - 4)) - 1); + } + for (size_t i = 16; i < 240; i++) { + assert((uint32_t(mid_lowbits[i / 16]) | + (kBitReverseNibbleLookup[i % 16] << (mid_nbits - 4))) == bits[i]); + } + end_bits = bits[256]; + // Construct lz77 lookup tables. 
+ for (size_t i = 0; i < 29; i++) { + for (size_t j = 0; j < (1U << kLZ77NBits[i]); j++) { + lz77_length_bits[kLZ77Base[i] + j] = + bits[257 + i] | (j << nbits[257 + i]); + } + } + + dist_bits = 0; + } + + HuffmanTable(const uint64_t *collected_data) { + ComputeNBits(collected_data); + FillNBits(); + FillBits(); + } + + // estimate for CollectSymbolCounts + // only fills nbits; skips computing actual codes + HuffmanTable() { + // the following is similar to ComputeNBits(0, 0, 0 ...), but much faster + constexpr uint8_t collapsed_nbits[] = { + 2, 3, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 5, 5, 4, 3, + + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 8}; + for (size_t i = 0; i < 16; i++) { + nbits[i] = collapsed_nbits[i]; + nbits[240 + i] = collapsed_nbits[16 + i]; + } + for (size_t i = 16; i < 240; i++) { + nbits[i] = 12; + } + for (size_t i = 0; i < 30; i++) { + nbits[256 + i] = collapsed_nbits[32 + i]; + } + + FillNBits(); + } +}; + +struct BitWriter { + void Write(uint32_t count, uint64_t bits) { + buffer |= bits << bits_in_buffer; + bits_in_buffer += count; + memcpy(data + bytes_written, &buffer, 8); + size_t bytes_in_buffer = bits_in_buffer / 8; + bits_in_buffer &= 7; + buffer >>= bytes_in_buffer * 8; + bytes_written += bytes_in_buffer; + } + + void ZeroPadToByte() { + if (bits_in_buffer != 0) { + Write(8 - bits_in_buffer, 0); + } + } + + void WriteBytes(const char *data, size_t count) { + memcpy(this->data + bytes_written, data, count); + bytes_written += count; + } + + unsigned char *data; + size_t bytes_written = 0; + size_t bits_in_buffer = 0; + uint64_t buffer = 0; +}; + +static void WriteHuffmanCode(const HuffmanTable &table, + BitWriter *__restrict writer) { + constexpr uint8_t kCodeLengthNbits[] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, + }; + constexpr uint8_t kCodeLengthOrder[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 
4, 12, 3, 13, 2, 14, 1, 15, + }; + writer->Write(5, 29); // all lit/len codes + writer->Write(5, 0); // distance code up to dist, included + writer->Write(4, 15); // all code length codes + for (size_t i = 0; i < 19; i++) { + writer->Write(3, kCodeLengthNbits[kCodeLengthOrder[i]]); + } + + for (size_t i = 0; i < 286; i++) { + writer->Write(4, kBitReverseNibbleLookup[table.nbits[i]]); + } + writer->Write(4, 0b1000); +} + +#ifdef __PCLMUL__ +} // namespace +#include +namespace { +alignas(32) static const uint8_t pshufb_shf_table[] = { + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, + 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f}; + +class Crc32 { + __m128i x0, x1, x2, x3; + + static inline __m128i double_xor(__m128i a, __m128i b, __m128i c) { +#ifdef __AVX512VL__ + return _mm_ternarylogic_epi32(a, b, c, 0x96); +#else + return _mm_xor_si128(_mm_xor_si128(a, b), c); +#endif + } + static inline __m128i do_one_fold(__m128i src, __m128i data) { + const auto k1k2 = _mm_set_epi32(1, 0x54442bd4, 1, 0xc6e41596); + return double_xor(_mm_clmulepi64_si128(src, k1k2, 0x01), + _mm_clmulepi64_si128(src, k1k2, 0x10), data); + } + +public: + Crc32() { + x0 = _mm_cvtsi32_si128(0x9db42487); + x1 = _mm_setzero_si128(); + x2 = _mm_setzero_si128(); + x3 = _mm_setzero_si128(); + } + size_t update(const unsigned char *__restrict data, size_t len) { + auto amount = len & ~63; + for (size_t i = 0; i < amount; i += 64) { + x0 = do_one_fold(x0, _mm_loadu_si128((__m128i *)(data + i))); + x1 = do_one_fold(x1, _mm_loadu_si128((__m128i *)(data + i + 0x10))); + x2 = do_one_fold(x2, _mm_loadu_si128((__m128i *)(data + i + 0x20))); + x3 = do_one_fold(x3, _mm_loadu_si128((__m128i *)(data + i + 0x30))); + } + return amount; + } + uint32_t update_final(const unsigned char *__restrict data, size_t len) { + if (len >= 64) { + update(data, len); + data += len & ~63; + len &= 63; + } + + if (len >= 48) { + 
auto t3 = x3; + x3 = do_one_fold(x2, _mm_loadu_si128((__m128i *)data + 2)); + x2 = do_one_fold(x1, _mm_loadu_si128((__m128i *)data + 1)); + x1 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x0 = t3; + } else if (len >= 32) { + auto t2 = x2; + auto t3 = x3; + x3 = do_one_fold(x1, _mm_loadu_si128((__m128i *)data + 1)); + x2 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x1 = t3; + x0 = t2; + } else if (len >= 16) { + auto t3 = x3; + x3 = do_one_fold(x0, _mm_loadu_si128((__m128i *)data)); + x0 = x1; + x1 = x2; + x2 = t3; + } + data += len & 48; + len &= 15; + + if (len > 0) { + auto xmm_shl = _mm_loadu_si128((__m128i *)(pshufb_shf_table + len)); + auto xmm_shr = _mm_xor_si128(xmm_shl, _mm_set1_epi8(-128)); + + auto t0 = _mm_loadu_si128((__m128i *)data); + auto t1 = _mm_shuffle_epi8(x0, xmm_shl); + + x0 = _mm_or_si128(_mm_shuffle_epi8(x0, xmm_shr), + _mm_shuffle_epi8(x1, xmm_shl)); + x1 = _mm_or_si128(_mm_shuffle_epi8(x1, xmm_shr), + _mm_shuffle_epi8(x2, xmm_shl)); + x2 = _mm_or_si128(_mm_shuffle_epi8(x2, xmm_shr), + _mm_shuffle_epi8(x3, xmm_shl)); + x3 = _mm_or_si128(_mm_shuffle_epi8(x3, xmm_shr), + _mm_shuffle_epi8(t0, xmm_shl)); + + x3 = do_one_fold(t1, x3); + } + + const auto k3k4 = _mm_set_epi32(1, 0x751997d0, 0, 0xccaa009e); + const auto k5k4 = _mm_set_epi32(1, 0x63cd6124, 0, 0xccaa009e); + const auto poly = _mm_set_epi32(1, 0xdb710640, 0, 0xf7011641); + + x0 = double_xor(x1, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + x0 = double_xor(x2, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + x0 = double_xor(x3, _mm_clmulepi64_si128(x0, k3k4, 0x10), + _mm_clmulepi64_si128(x0, k3k4, 0x01)); + + x1 = + _mm_xor_si128(_mm_clmulepi64_si128(x0, k5k4, 0), _mm_srli_si128(x0, 8)); + + x0 = _mm_slli_si128(x1, 4); + x0 = _mm_clmulepi64_si128(x0, k5k4, 0x10); +#ifdef __AVX512VL__ + x0 = _mm_ternarylogic_epi32(x0, x1, _mm_set_epi32(0, -1, -1, 0), 0x28); +#else + x1 = _mm_and_si128(x1, 
_mm_set_epi32(0, -1, -1, 0)); + x0 = _mm_xor_si128(x0, x1); +#endif + + x1 = _mm_clmulepi64_si128(x0, poly, 0); + x1 = _mm_clmulepi64_si128(x1, poly, 0x10); +#ifdef __AVX512VL__ + x1 = _mm_ternarylogic_epi32(x1, x0, x0, 0xC3); // NOT(XOR(x1, x0)) +#else + x0 = _mm_xor_si128(x0, _mm_set_epi32(0, -1, -1, 0)); + x1 = _mm_xor_si128(x1, x0); +#endif + return _mm_extract_epi32(x1, 2); + } +}; +#else +} // namespace +#include +#include +#include +namespace { +// from https://joelfilho.com/blog/2020/compile_time_lookup_tables_in_cpp/ +template +constexpr auto lut_impl(Generator &&f, std::index_sequence) { + using content_type = decltype(f(std::size_t{0})); + return std::array{{f(Indexes)...}}; +} +template +constexpr auto lut(Generator &&f) { + return lut_impl(std::forward(f), + std::make_index_sequence{}); +} + +constexpr uint32_t crc32_slice8_gen(unsigned n) { + uint32_t crc = n & 0xff; + for (int i = n >> 8; i >= 0; i--) { + for (int j = 0; j < 8; j++) { + crc = (crc >> 1) ^ + ((crc & 1) * 0xEDB88320); // 0xEDB88320 = CRC32 polynomial + } + } + return crc; +} +static constexpr auto kCrcSlice8LUT = lut<256 * 8>(crc32_slice8_gen); + +class Crc32 { + uint32_t state; + + // this is based off Fast CRC32 slice-by-8: + // https://create.stephan-brumme.com/crc32/ + static inline uint32_t crc_process_iter(uint32_t crc, + const uint32_t *current) { + uint32_t one = *current++ ^ crc; + uint32_t two = *current; + return kCrcSlice8LUT[(two >> 24) & 0xFF] ^ + kCrcSlice8LUT[0x100 + ((two >> 16) & 0xFF)] ^ + kCrcSlice8LUT[0x200 + ((two >> 8) & 0xFF)] ^ + kCrcSlice8LUT[0x300 + (two & 0xFF)] ^ + kCrcSlice8LUT[0x400 + ((one >> 24) & 0xFF)] ^ + kCrcSlice8LUT[0x500 + ((one >> 16) & 0xFF)] ^ + kCrcSlice8LUT[0x600 + ((one >> 8) & 0xFF)] ^ + kCrcSlice8LUT[0x700 + (one & 0xFF)]; + } + +public: + Crc32() : state(0xffffffff) {} + size_t update(const unsigned char *__restrict data, size_t len) { + auto amount = len & ~7; + for (size_t i = 0; i < amount; i += 8) { + state = crc_process_iter(state, 
(uint32_t *)(data + i)); + } + return amount; + } + uint32_t update_final(const unsigned char *__restrict data, size_t len) { + auto i = update(data, len); + for (; i < len; i++) { + state = (state >> 8) ^ kCrcSlice8LUT[(state & 0xFF) ^ data[i]]; + } + return ~state; + } +}; + +#endif + +constexpr unsigned kAdler32Mod = 65521; + +static void UpdateAdler32(uint32_t &s1, uint32_t &s2, uint8_t byte) { + s1 += byte; + s2 += s1; + s1 %= kAdler32Mod; + s2 %= kAdler32Mod; +} + +static uint32_t hadd(MIVEC v) { + auto sum = +#ifdef __AVX2__ + _mm_add_epi32(_mm256_castsi256_si128(v), _mm256_extracti128_si256(v, 1)); +#else + v; +#endif + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, sum); + return _mm_cvtsi128_si32(sum); +} + +template +static FORCE_INLINE MIVEC PredictVec(const unsigned char *current_buf, + const unsigned char *top_buf, + const unsigned char *left_buf, + const unsigned char *topleft_buf) { + auto data = MMSI(load)((MIVEC *)(current_buf)); + if (predictor == 0) { + return data; + } else if (predictor == 1) { + auto pred = MMSI(loadu)((MIVEC *)(left_buf)); + return MM(sub_epi8)(data, pred); + } else if (predictor == 2) { + auto pred = MMSI(load)((MIVEC *)(top_buf)); + return MM(sub_epi8)(data, pred); + } else if (predictor == 3) { + auto left = MMSI(loadu)((MIVEC *)(left_buf)); + auto top = MMSI(load)((MIVEC *)(top_buf)); + auto pred = MM(sub_epi8)(MM(add_epi8)(top, left), MM(avg_epu8)(top, left)); + return MM(sub_epi8)(data, pred); + } else { + auto left = MMSI(loadu)((MIVEC *)(left_buf)); + auto top = MMSI(load)((MIVEC *)(top_buf)); + auto c = MMSI(loadu)((MIVEC *)(topleft_buf)); + + auto a = MM(min_epu8)(left, top); + auto b = MM(max_epu8)(left, top); + + auto pa = MM(subs_epu8)(b, c); + auto pb = MM(subs_epu8)(c, a); + + auto min_pab = MM(min_epu8)(pa, pb); + auto pc = MM(sub_epi8)(MM(max_epu8)(pa, pb), min_pab); + + auto min_pabc = MM(min_epu8)(min_pab, pc); + auto use_a = MM(cmpeq_epi8)(min_pabc, pa); + auto use_b = 
MM(cmpeq_epi8)(min_pabc, pb); + + auto pred = MM(blendv_epi8)(MM(blendv_epi8)(c, b, use_b), a, use_a); + return MM(sub_epi8)(data, pred); + } +} + +alignas(SIMD_WIDTH) constexpr int32_t _kMaskVec[] = {0, 0, 0, 0, +#if SIMD_WIDTH == 32 + 0, 0, 0, 0, + -1, -1, -1, -1, +#endif + -1, -1, -1, -1}; +static const uint8_t *kMaskVec = + reinterpret_cast(_kMaskVec) + SIMD_WIDTH; + +template +static void +ProcessRow(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, CB &&cb, CB_ADL &&cb_adl, + CB_RLE &&cb_rle) { + size_t run = 0; + size_t i = 0; + for (; i + SIMD_WIDTH <= bytes_per_line; i += SIMD_WIDTH) { + auto pdata = PredictVec(current_row_buf + i, top_buf + i, + left_buf + i, topleft_buf + i); + unsigned pdatais0 = + MM(movemask_epi8)(MM(cmpeq_epi8)(pdata, MMSI(setzero)())); + if (pdatais0 == SIMD_MASK) { + run += SIMD_WIDTH; + } else { + if (run != 0) { + cb_rle(run); + } + run = 0; + cb(pdata, SIMD_WIDTH); + } + cb_adl(pdata, SIMD_WIDTH, i); + } + size_t bytes_remaining = + bytes_per_line ^ i; // equivalent to `bytes_per_line - i` + if (bytes_remaining) { + auto pdata = PredictVec(current_row_buf + i, top_buf + i, + left_buf + i, topleft_buf + i); + unsigned pdatais0 = + MM(movemask_epi8)(MM(cmpeq_epi8)(pdata, MMSI(setzero)())); + auto mask = (1UL << bytes_remaining) - 1; + + if ((pdatais0 & mask) == mask && run + bytes_remaining >= 16) { + run += bytes_remaining; + } else { + if (run != 0) { + cb_rle(run); + } + run = 0; + cb(pdata, bytes_remaining); + } + cb_adl(pdata, bytes_remaining, i); + } + if (run != 0) { + cb_rle(run); + } +} + +template +static void +ProcessRow(uint8_t predictor, size_t bytes_per_line, + const unsigned char *current_row_buf, const unsigned char *top_buf, + const unsigned char *left_buf, const unsigned char *topleft_buf, + CB &&cb, CB_ADL &&cb_adl, CB_RLE &&cb_rle) { + if (predictor == 1) { + ProcessRow<1>(bytes_per_line, 
current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 2) { + ProcessRow<2>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 3) { + ProcessRow<3>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else if (predictor == 4) { + ProcessRow<4>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } else { + assert(predictor == 0); + ProcessRow<0>(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, cb, cb_adl, cb_rle); + } +} + +template static void ForAllRLESymbols(size_t length, CB &&cb) { + assert(length >= 4); + length -= 1; + + if (length < 258) { + // fast path if long sequences are rare in the image + cb(length, 1); + } else { + auto runs = length / 258; + auto remain = length % 258; + if (remain == 1 || remain == 2) { + remain += 258 - 3; + runs--; + cb(3, 1); + } + if (runs) { + cb(258, runs); + } + if (remain) { + cb(remain, 1); + } + } +} + +template +static void +TryPredictor(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *predicted_data, + const HuffmanTable &table, size_t &best_cost, uint8_t &predictor) { + size_t cost_rle = 0; + MIVEC cost_direct = MMSI(setzero)(); + auto cost_chunk_cb = [&](const MIVEC bytes, + const size_t bytes_in_vec) FORCE_INLINE_LAMBDA { + auto data_for_lut = MMSI(and)(MM(set1_epi8)(0xF), bytes); + // get a mask of `bytes` that are between -16 and 15 inclusive + // (`-16 <= bytes <= 15` is equivalent to `bytes + 112 > 95`) + auto use_lowhi = MM(cmpgt_epi8)(MM(add_epi8)(bytes, MM(set1_epi8)(112)), + MM(set1_epi8)(95)); + + auto nbits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_nbits)), data_for_lut); + auto nbits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i 
*)table.last16_nbits)), data_for_lut); + + auto nbits = MM(blendv_epi8)(nbits_low16, nbits_hi16, bytes); + nbits = MM(blendv_epi8)(MM(set1_epi8)(table.mid_nbits), nbits, use_lowhi); + + auto nbits_discard = + MMSI(and)(nbits, MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec))); + + cost_direct = + MM(add_epi32)(cost_direct, MM(sad_epu8)(nbits, nbits_discard)); + }; + auto rle_cost_cb = [&](size_t run) { + cost_rle += table.first16_nbits[0]; + ForAllRLESymbols(run, [&](size_t len, size_t count) { + cost_rle += (table.dist_nbits + table.lz77_length_nbits[len]) * count; + }); + }; + if (store_pred) { + ProcessRow( + bytes_per_line, current_row_buf, top_buf, left_buf, topleft_buf, + cost_chunk_cb, + [=](const MIVEC pdata, size_t, size_t i) { + MMSI(store)((MIVEC *)(predicted_data + i), pdata); + }, + rle_cost_cb); + } else { + ProcessRow( + bytes_per_line, current_row_buf, top_buf, left_buf, topleft_buf, + cost_chunk_cb, [](const MIVEC, size_t, size_t) {}, rle_cost_cb); + } + size_t cost = cost_rle + hadd(cost_direct); + if (cost < best_cost) { + best_cost = cost; + predictor = pred; + } +} + +static FORCE_INLINE void WriteBitsLong(MIVEC nbits, MIVEC bits_lo, + MIVEC bits_hi, size_t mid_lo_nbits, + BitWriter *__restrict writer) { + + // Merge bits_lo and bits_hi in 16-bit "bits". 
+#if FPNGE_USE_PEXT + auto bits0 = MM(unpacklo_epi8)(bits_lo, bits_hi); + auto bits1 = MM(unpackhi_epi8)(bits_lo, bits_hi); + + // convert nbits into a mask + auto nbits_hi = MM(sub_epi8)(nbits, MM(set1_epi8)(mid_lo_nbits)); + auto nbits0 = MM(unpacklo_epi8)(nbits, nbits_hi); + auto nbits1 = MM(unpackhi_epi8)(nbits, nbits_hi); + const auto nbits_to_mask = + BCAST128(_mm_set_epi32(0xffffffff, 0xffffffff, 0x7f3f1f0f, 0x07030100)); + auto bitmask0 = MM(shuffle_epi8)(nbits_to_mask, nbits0); + auto bitmask1 = MM(shuffle_epi8)(nbits_to_mask, nbits1); + + // aggregate nbits + alignas(16) uint16_t nbits_a[SIMD_WIDTH / 4]; + auto bit_count = MM(maddubs_epi16)(nbits, MM(set1_epi8)(1)); +#ifdef __AVX2__ + auto bit_count2 = _mm_hadd_epi16(_mm256_castsi256_si128(bit_count), + _mm256_extracti128_si256(bit_count, 1)); + _mm_store_si128((__m128i *)nbits_a, bit_count2); +#else + bit_count = _mm_hadd_epi16(bit_count, bit_count); + _mm_storel_epi64((__m128i *)nbits_a, bit_count); +#endif + + alignas(SIMD_WIDTH) uint64_t bitmask_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)bitmask_a, bitmask0); + MMSI(store)((MIVEC *)bitmask_a + 1, bitmask1); + +#else + + auto nbits0 = MM(unpacklo_epi8)(nbits, MMSI(setzero)()); + auto nbits1 = MM(unpackhi_epi8)(nbits, MMSI(setzero)()); + MIVEC bits0, bits1; + if (mid_lo_nbits == 8) { + bits0 = MM(unpacklo_epi8)(bits_lo, bits_hi); + bits1 = MM(unpackhi_epi8)(bits_lo, bits_hi); + } else { + auto nbits_shift = _mm_cvtsi32_si128(8 - mid_lo_nbits); + auto bits_lo_shifted = MM(sll_epi16)(bits_lo, nbits_shift); + bits0 = MM(unpacklo_epi8)(bits_lo_shifted, bits_hi); + bits1 = MM(unpackhi_epi8)(bits_lo_shifted, bits_hi); + + bits0 = MM(srl_epi16)(bits0, nbits_shift); + bits1 = MM(srl_epi16)(bits1, nbits_shift); + } + + // 16 -> 32 + auto nbits0_32_lo = MMSI(and)(nbits0, MM(set1_epi32)(0xFFFF)); + auto nbits1_32_lo = MMSI(and)(nbits1, MM(set1_epi32)(0xFFFF)); + + auto bits0_32_lo = MMSI(and)(bits0, MM(set1_epi32)(0xFFFF)); + auto bits1_32_lo = MMSI(and)(bits1, 
MM(set1_epi32)(0xFFFF)); +#ifdef __AVX2__ + auto bits0_32_hi = MM(sllv_epi32)(MM(srli_epi32)(bits0, 16), nbits0_32_lo); + auto bits1_32_hi = MM(sllv_epi32)(MM(srli_epi32)(bits1, 16), nbits1_32_lo); +#else + // emulate variable shift by abusing float exponents + // this works because Huffman symbols are not allowed to exceed 15 bits, so + // will fit within a float's mantissa and (number << 15) won't overflow when + // converted back to a signed int + auto bits0_32_hi = + _mm_castps_si128(MM(cvtepi32_ps)(MM(srli_epi32)(bits0, 16))); + auto bits1_32_hi = + _mm_castps_si128(MM(cvtepi32_ps)(MM(srli_epi32)(bits1, 16))); + + // add shift amount to the exponent + bits0_32_hi = MM(add_epi32)(bits0_32_hi, MM(slli_epi32)(nbits0_32_lo, 23)); + bits1_32_hi = MM(add_epi32)(bits1_32_hi, MM(slli_epi32)(nbits1_32_lo, 23)); + + bits0_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits0_32_hi)); + bits1_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits1_32_hi)); +#endif + + nbits0 = MM(madd_epi16)(nbits0, MM(set1_epi16)(1)); + nbits1 = MM(madd_epi16)(nbits1, MM(set1_epi16)(1)); + auto bits0_32 = MMSI(or)(bits0_32_lo, bits0_32_hi); + auto bits1_32 = MMSI(or)(bits1_32_lo, bits1_32_hi); + + // 32 -> 64 +#ifdef __AVX2__ + auto nbits_inv0_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits0); + auto nbits_inv1_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits1); + bits0 = MM(sllv_epi32)(bits0_32, nbits_inv0_64_lo); + bits1 = MM(sllv_epi32)(bits1_32, nbits_inv1_64_lo); + bits0 = MM(srlv_epi64)(bits0, nbits_inv0_64_lo); + bits1 = MM(srlv_epi64)(bits1, nbits_inv1_64_lo); +#else + auto nbits0_64_lo = MMSI(and)(nbits0, MM(set1_epi64x)(0xFFFFFFFF)); + auto nbits1_64_lo = MMSI(and)(nbits1, MM(set1_epi64x)(0xFFFFFFFF)); + // just do two shifts for SSE variant + auto bits0_64_lo = MMSI(and)(bits0_32, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits1_64_lo = MMSI(and)(bits1_32, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits0_64_hi = MM(srli_epi64)(bits0_32, 32); + auto bits1_64_hi = MM(srli_epi64)(bits1_32, 32); + + 
bits0_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits0_64_hi, nbits0_64_lo), + _mm_sll_epi64(bits0_64_hi, + _mm_unpackhi_epi64(nbits0_64_lo, nbits0_64_lo)), + 0xf0); + bits1_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits1_64_hi, nbits1_64_lo), + _mm_sll_epi64(bits1_64_hi, + _mm_unpackhi_epi64(nbits1_64_lo, nbits1_64_lo)), + 0xf0); + + bits0 = MMSI(or)(bits0_64_lo, bits0_64_hi); + bits1 = MMSI(or)(bits1_64_lo, bits1_64_hi); +#endif + + auto nbits01 = MM(hadd_epi32)(nbits0, nbits1); + + // nbits_a <= 40 as we have at most 10 bits per symbol, so the call to the + // writer is safe. + alignas(SIMD_WIDTH) uint32_t nbits_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)nbits_a, nbits01); + +#endif + + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 4]; + MMSI(store)((MIVEC *)bits_a, bits0); + MMSI(store)((MIVEC *)bits_a + 1, bits1); + +#ifdef __AVX2__ + constexpr uint8_t kPerm[] = {0, 1, 4, 5, 2, 3, 6, 7}; +#else + constexpr uint8_t kPerm[] = {0, 1, 2, 3}; +#endif + + for (size_t ii = 0; ii < SIMD_WIDTH / 4; ii++) { + uint64_t bits = bits_a[kPerm[ii]]; +#if FPNGE_USE_PEXT + bits = _pext_u64(bits, bitmask_a[kPerm[ii]]); +#endif + auto count = nbits_a[ii]; + writer->Write(count, bits); + } +} + +// as above, but where nbits <= 8, so we can ignore bits_hi +static FORCE_INLINE void WriteBitsShort(MIVEC nbits, MIVEC bits, + BitWriter *__restrict writer) { + +#if FPNGE_USE_PEXT + // convert nbits into a mask + auto bitmask = MM(shuffle_epi8)( + BCAST128(_mm_set_epi32(0xffffffff, 0xffffffff, 0x7f3f1f0f, 0x07030100)), + nbits); + auto bit_count = MM(sad_epu8)(nbits, MMSI(setzero)()); + + alignas(SIMD_WIDTH) uint64_t nbits_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)nbits_a, bit_count); + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)bits_a, bits); + alignas(SIMD_WIDTH) uint64_t bitmask_a[SIMD_WIDTH / 8]; + MMSI(store)((MIVEC *)bitmask_a, bitmask); +#else + // 8 -> 16 + auto prod = MM(slli_epi16)( + MM(shuffle_epi8)(BCAST128(_mm_set_epi32( + // since we can't 
handle 8 bits, we'll under-shift + // it and do an extra shift later on + -1, 0xffffff80, 0x40201008, 0x040201ff)), + nbits), + 8); + auto bits_hi = + MM(mulhi_epu16)(MMSI(andnot)(MM(set1_epi16)(0xff), bits), prod); + bits_hi = MM(add_epi16)(bits_hi, bits_hi); // fix under-shifting + bits = MMSI(or)(MMSI(and)(bits, MM(set1_epi16)(0xff)), bits_hi); + nbits = MM(maddubs_epi16)(nbits, MM(set1_epi8)(1)); + + // 16 -> 32 + auto nbits_32_lo = MMSI(and)(nbits, MM(set1_epi32)(0xFFFF)); + auto bits_32_lo = MMSI(and)(bits, MM(set1_epi32)(0xFFFF)); + auto bits_32_hi = MM(srli_epi32)(bits, 16); +#ifdef __AVX2__ + bits_32_hi = MM(sllv_epi32)(bits_32_hi, nbits_32_lo); +#else + // need to avoid overflow when converting float -> int, because it converts to + // a signed int; do this by offsetting the shift by 1 + nbits_32_lo = MM(add_epi16)(nbits_32_lo, MM(set1_epi32)(0xFFFF)); + bits_32_hi = _mm_castps_si128(MM(cvtepi32_ps)(bits_32_hi)); + bits_32_hi = MM(add_epi32)(bits_32_hi, MM(slli_epi32)(nbits_32_lo, 23)); + bits_32_hi = MM(cvtps_epi32)(_mm_castsi128_ps(bits_32_hi)); + bits_32_hi = MM(add_epi32)(bits_32_hi, bits_32_hi); // fix under-shifting +#endif + nbits = MM(madd_epi16)(nbits, MM(set1_epi16)(1)); + bits = MMSI(or)(bits_32_lo, bits_32_hi); + + // 32 -> 64 +#ifdef __AVX2__ + auto nbits_inv_64_lo = MM(subs_epu8)(MM(set1_epi64x)(32), nbits); + bits = MM(sllv_epi32)(bits, nbits_inv_64_lo); + bits = MM(srlv_epi64)(bits, nbits_inv_64_lo); +#else + auto nbits_64_lo = MMSI(and)(nbits, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits_64_lo = MMSI(and)(bits, MM(set1_epi64x)(0xFFFFFFFF)); + auto bits_64_hi = MM(srli_epi64)(bits, 32); + bits_64_hi = _mm_blend_epi16( + _mm_sll_epi64(bits_64_hi, nbits_64_lo), + _mm_sll_epi64(bits_64_hi, _mm_unpackhi_epi64(nbits_64_lo, nbits_64_lo)), + 0xf0); + bits = MMSI(or)(bits_64_lo, bits_64_hi); +#endif + + auto nbits2 = _mm_hadd_epi32( +#ifdef __AVX2__ + _mm256_castsi256_si128(nbits), _mm256_extracti128_si256(nbits, 1) +#else + nbits, nbits +#endif + ); 
+ + alignas(16) uint32_t nbits_a[4]; + alignas(SIMD_WIDTH) uint64_t bits_a[SIMD_WIDTH / 8]; + + _mm_store_si128((__m128i *)nbits_a, nbits2); + MMSI(store)((MIVEC *)bits_a, bits); + +#endif + + for (size_t ii = 0; ii < SIMD_WIDTH / 8; ii++) { + uint64_t bits64 = bits_a[ii]; +#if FPNGE_USE_PEXT + bits64 = _pext_u64(bits64, bitmask_a[ii]); +#endif + if (nbits_a[ii] + writer->bits_in_buffer > 63) { + // hope this case rarely occurs + writer->Write(16, bits64 & 0xffff); + bits64 >>= 16; + nbits_a[ii] -= 16; + } + writer->Write(nbits_a[ii], bits64); + } +} + +static FORCE_INLINE void AddApproxCost(MIVEC &total, MIVEC pdata, + MIVEC bit_costs) { + auto approx_sym = MM(min_epu8)(MM(abs_epi8)(pdata), MM(set1_epi8)(15)); + auto cost = MM(shuffle_epi8)(bit_costs, approx_sym); + total = MM(add_epi64)(total, MM(sad_epu8)(cost, MMSI(setzero)())); +} +static FORCE_INLINE void AddApproxCost(MIVEC &total, MIVEC pdata, + MIVEC bit_costs, MIVEC maskv) { + auto approx_sym = MM(min_epu8)(MM(abs_epi8)(pdata), MM(set1_epi8)(15)); + auto cost = MM(shuffle_epi8)(bit_costs, approx_sym); + auto cost_mask = MMSI(and)(maskv, cost); + total = MM(add_epi64)(total, MM(sad_epu8)(cost, cost_mask)); +} + +static uint8_t +SelectPredictor(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + const HuffmanTable &table, const struct FPNGEOptions *options) { + if (options->predictor <= 4) { + return options->predictor; + } + if (options->predictor == FPNGE_PREDICTOR_APPROX) { + auto bit_costs = BCAST128(_mm_load_si128((__m128i *)(table.approx_nbits))); + size_t i = 0; + auto cost1 = MMSI(setzero)(); + auto cost2 = MMSI(setzero)(); + auto cost3 = MMSI(setzero)(); + auto cost4 = MMSI(setzero)(); + MIVEC pdata; + + for (; i + SIMD_WIDTH <= bytes_per_line; i += SIMD_WIDTH) { + pdata = PredictVec<1>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + 
AddApproxCost(cost1, pdata, bit_costs); + + pdata = PredictVec<2>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost2, pdata, bit_costs); + + pdata = PredictVec<3>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost3, pdata, bit_costs); + + pdata = PredictVec<4>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost4, pdata, bit_costs); + MMSI(store)((MIVEC *)(paeth_data + i), pdata); + } + + size_t bytes_remaining = + bytes_per_line ^ i; // equivalent to `bytes_per_line - i` + if (bytes_remaining) { + auto maskv = MMSI(loadu)((MIVEC *)(kMaskVec - bytes_remaining)); + + pdata = PredictVec<1>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost1, pdata, bit_costs, maskv); + + pdata = PredictVec<2>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost2, pdata, bit_costs, maskv); + + pdata = PredictVec<3>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost3, pdata, bit_costs, maskv); + + pdata = PredictVec<4>(current_row_buf + i, top_buf + i, left_buf + i, + topleft_buf + i); + AddApproxCost(cost4, pdata, bit_costs, maskv); + MMSI(store)((MIVEC *)(paeth_data + i), pdata); + } + + uint8_t predictor = 1; + size_t best_cost = hadd(cost1); + auto test_cost = [&](MIVEC costv, uint8_t pred) { + size_t cost = hadd(costv); + if (cost < best_cost) { + best_cost = cost; + predictor = pred; + } + }; + test_cost(cost2, 2); + test_cost(cost3, 3); + test_cost(cost4, 4); + return predictor; + } + + assert(options->predictor == FPNGE_PREDICTOR_BEST); + uint8_t predictor; + size_t best_cost = ~0U; + TryPredictor<1, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, best_cost, predictor); + TryPredictor<2, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, 
best_cost, predictor); + TryPredictor<3, /*store_pred=*/false>(bytes_per_line, current_row_buf, + top_buf, left_buf, topleft_buf, nullptr, + table, best_cost, predictor); + TryPredictor<4, /*store_pred=*/true>(bytes_per_line, current_row_buf, top_buf, + left_buf, topleft_buf, paeth_data, table, + best_cost, predictor); + return predictor; +} + +static void +EncodeOneRow(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + const HuffmanTable &table, uint32_t &s1, uint32_t &s2, + BitWriter *__restrict writer, const struct FPNGEOptions *options) { + uint8_t predictor = + SelectPredictor(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, paeth_data, table, options); + + writer->Write(table.first16_nbits[predictor], table.first16_bits[predictor]); + UpdateAdler32(s1, s2, predictor); + + auto adler_accum_s1 = INT2VEC(s1); + auto adler_accum_s2 = INT2VEC(s2); + auto adler_s1_sum = MMSI(setzero)(); + + uint16_t bytes_since_flush = 1; + + auto flush_adler = [&]() { + adler_accum_s2 = MM(add_epi32)( + adler_accum_s2, MM(slli_epi32)(adler_s1_sum, SIMD_WIDTH == 32 ? 
5 : 4)); + adler_s1_sum = MMSI(setzero)(); + + uint32_t ls1 = hadd(adler_accum_s1); + uint32_t ls2 = hadd(adler_accum_s2); + ls1 %= kAdler32Mod; + ls2 %= kAdler32Mod; + s1 = ls1; + s2 = ls2; + adler_accum_s1 = INT2VEC(s1); + adler_accum_s2 = INT2VEC(s2); + bytes_since_flush = 0; + }; + + auto encode_chunk_cb = [&](const MIVEC bytes, const size_t bytes_in_vec) { + auto maskv = MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec)); + + auto data_for_lut = MMSI(and)(MM(set1_epi8)(0xF), bytes); + data_for_lut = MMSI(or)(data_for_lut, maskv); + // get a mask of `bytes` that are between -16 and 15 inclusive + // (`-16 <= bytes <= 15` is equivalent to `bytes + 112 > 95`) + auto use_lowhi = MM(cmpgt_epi8)(MM(add_epi8)(bytes, MM(set1_epi8)(112)), + MM(set1_epi8)(95)); + + auto nbits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_nbits)), data_for_lut); + auto nbits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.last16_nbits)), data_for_lut); + auto nbits = MM(blendv_epi8)(nbits_low16, nbits_hi16, bytes); + + auto bits_low16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.first16_bits)), data_for_lut); + auto bits_hi16 = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.last16_bits)), data_for_lut); + auto bits_lo = MM(blendv_epi8)(bits_low16, bits_hi16, bytes); + + if (MM(movemask_epi8)(use_lowhi) ^ SIMD_MASK) { + auto data_for_midlut = + MMSI(and)(MM(set1_epi8)(0xF), MM(srai_epi16)(bytes, 4)); + + auto bits_mid_lo = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)table.mid_lowbits)), + data_for_midlut); + + auto bits_hi = MM(shuffle_epi8)( + BCAST128(_mm_load_si128((__m128i *)kBitReverseNibbleLookup)), + data_for_lut); + + use_lowhi = MMSI(or)(use_lowhi, maskv); + nbits = MM(blendv_epi8)(MM(set1_epi8)(table.mid_nbits), nbits, use_lowhi); + bits_lo = MM(blendv_epi8)(bits_mid_lo, bits_lo, use_lowhi); + +#if !FPNGE_USE_PEXT + bits_hi = MMSI(andnot)(use_lowhi, bits_hi); +#endif + + 
WriteBitsLong(nbits, bits_lo, bits_hi, table.mid_nbits - 4, writer); + } else { + // since mid (symbols 16-239) is not present, we can take some shortcuts + // this is expected to occur frequently if compression is effective + WriteBitsShort(nbits, bits_lo, writer); + } + }; + + auto adler_chunk_cb = [&](const MIVEC pdata, size_t bytes_in_vec, size_t) { + bytes_since_flush += bytes_in_vec; + auto bytes = pdata; + + auto muls = MM(set_epi8)( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#if SIMD_WIDTH == 32 + , + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +#endif + ); + + if (bytes_in_vec < SIMD_WIDTH) { + adler_accum_s2 = MM(add_epi32)( + MM(mul_epu32)(MM(set1_epi32)(bytes_in_vec), adler_accum_s1), + adler_accum_s2); + bytes = + MMSI(andnot)(MMSI(loadu)((MIVEC *)(kMaskVec - bytes_in_vec)), bytes); + muls = MM(add_epi8)(muls, MM(set1_epi8)(bytes_in_vec - SIMD_WIDTH)); + } else { + adler_s1_sum = MM(add_epi32)(adler_s1_sum, adler_accum_s1); + } + + adler_accum_s1 = + MM(add_epi32)(adler_accum_s1, MM(sad_epu8)(bytes, MMSI(setzero)())); + + auto bytesmuls = MM(maddubs_epi16)(bytes, muls); + adler_accum_s2 = MM(add_epi32)( + adler_accum_s2, MM(madd_epi16)(bytesmuls, MM(set1_epi16)(1))); + + if (bytes_since_flush >= 5500) { + flush_adler(); + } + }; + + auto encode_rle_cb = [&](size_t run) { + writer->Write(table.first16_nbits[0], table.first16_bits[0]); + ForAllRLESymbols(run, [&](size_t len, size_t count) { + uint32_t bits = (table.dist_bits << table.lz77_length_nbits[len]) | + table.lz77_length_bits[len]; + auto nbits = table.lz77_length_nbits[len] + table.dist_nbits; + while (count--) { + writer->Write(nbits, bits); + } + }); + }; + + if (options->predictor > 4 && predictor == 4) { + // re-use Paeth data + ProcessRow<0>(bytes_per_line, paeth_data, nullptr, nullptr, nullptr, + encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } else { + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, 
encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } + + flush_adler(); +} + +static void +CollectSymbolCounts(size_t bytes_per_line, const unsigned char *current_row_buf, + const unsigned char *top_buf, const unsigned char *left_buf, + const unsigned char *topleft_buf, unsigned char *paeth_data, + uint64_t *__restrict symbol_counts, + const struct FPNGEOptions *options) { + + auto encode_chunk_cb = [&](const MIVEC pdata, const size_t bytes_in_vec) { + alignas(SIMD_WIDTH) uint8_t predicted_data[SIMD_WIDTH]; + MMSI(store)((MIVEC *)predicted_data, pdata); + for (size_t i = 0; i < bytes_in_vec; i++) { + symbol_counts[predicted_data[i]] += 1; + } + }; + + auto adler_chunk_cb = [&](const MIVEC, size_t, size_t) {}; + + auto encode_rle_cb = [&](size_t run) { + symbol_counts[0] += 1; + constexpr size_t kLZ77Sym[] = { + 0, 0, 0, 257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, + 266, 267, 267, 268, 268, 269, 269, 269, 269, 270, 270, 270, 270, 271, + 271, 271, 271, 272, 272, 272, 272, 273, 273, 273, 273, 273, 273, 273, + 273, 274, 274, 274, 274, 274, 274, 274, 274, 275, 275, 275, 275, 275, + 275, 275, 275, 276, 276, 276, 276, 276, 276, 276, 276, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 278, + 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278, + 278, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, 279, + 279, 279, 279, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, + 281, 281, 281, 281, 281, 281, 281, 281, 281, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, + 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, + 283, 283, 283, 284, 284, 284, 
284, 284, 284, 284, 284, 284, 284, 284, + 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, + 284, 284, 284, 284, 284, 284, 285, + }; + ForAllRLESymbols(run, [&](size_t len, size_t count) { + symbol_counts[kLZ77Sym[len]] += count; + }); + }; + + if (options->predictor == FPNGE_PREDICTOR_APPROX) { + // filter selection here seems to be slightly more effective when using the + // approximate selector; more investigation is probably warranted + HuffmanTable dummy_table; + uint8_t predictor = + SelectPredictor(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, paeth_data, dummy_table, options); + if (predictor == 4) { + ProcessRow<0>(bytes_per_line, paeth_data, nullptr, nullptr, nullptr, + encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } else { + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } + } else { + uint8_t predictor = options->predictor > 4 ? 4 : options->predictor; + ProcessRow(predictor, bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, encode_chunk_cb, adler_chunk_cb, encode_rle_cb); + } +} + +static void AppendBE32(size_t value, BitWriter *__restrict writer) { + writer->Write(8, value >> 24); + writer->Write(8, (value >> 16) & 0xFF); + writer->Write(8, (value >> 8) & 0xFF); + writer->Write(8, value & 0xFF); +} + +static void WriteHeader(size_t width, size_t height, size_t bytes_per_channel, + size_t num_channels, char cicp_colorspace, + const FPNGEAdditionalChunk *additional_chunks, + int num_additional_chunks, + BitWriter *__restrict writer) { + constexpr uint8_t kPNGHeader[8] = {137, 80, 78, 71, 13, 10, 26, 10}; + for (size_t i = 0; i < 8; i++) { + writer->Write(8, kPNGHeader[i]); + } + // Length + writer->Write(32, 0x0d000000); + assert(writer->bits_in_buffer == 0); + size_t crc_start = writer->bytes_written; + // IHDR + writer->Write(32, 0x52444849); + AppendBE32(width, writer); + AppendBE32(height, 
writer); + // Bit depth + writer->Write(8, bytes_per_channel * 8); + // Colour type + constexpr uint8_t numc_to_colour_type[] = {0, 0, 4, 2, 6}; + writer->Write(8, numc_to_colour_type[num_channels]); + // Compression, filter and interlace methods. + writer->Write(24, 0); + assert(writer->bits_in_buffer == 0); + size_t crc_end = writer->bytes_written; + uint32_t crc = + Crc32().update_final(writer->data + crc_start, crc_end - crc_start); + AppendBE32(crc, writer); + + if (cicp_colorspace == FPNGE_CICP_PQ) { + writer->Write(32, 0x04000000); + writer->Write(32, 0x50434963); // cICP + writer->Write(32, 0x01001009); // PQ, Rec2020 + writer->Write(32, 0xfe23234d); // CRC + } + for (int i = 0; i < num_additional_chunks; i++) { + AppendBE32(additional_chunks[i].data_size, writer); + size_t crc_start = writer->bytes_written; + uint32_t name = 0; + memcpy(&name, additional_chunks[i].name, 4); + writer->Write(32, name); + writer->WriteBytes(additional_chunks[i].data, + additional_chunks[i].data_size); + size_t crc_end = writer->bytes_written; + uint32_t crc = + Crc32().update_final(writer->data + crc_start, crc_end - crc_start); + AppendBE32(crc, writer); + } +} + +void CopyRow(unsigned char *dst, const unsigned char *src, size_t nb_channels, + size_t bytes_per_channel, FPNGEColorChannelOrder order, + size_t width) { + if (order == FPNGE_ORDER_RGB || nb_channels <= 2) { + memcpy(dst, src, nb_channels * bytes_per_channel * width); + return; + } + size_t x = 0; + if (nb_channels == 4 && bytes_per_channel == 1) { + for (; x + SIMD_WIDTH / 4 <= width; x += SIMD_WIDTH / 4) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 4)); + auto shuf = MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, + 13, 12, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 4), shuf); + } + } else if (nb_channels == 4 && bytes_per_channel == 2) { + for (; x + SIMD_WIDTH / 8 <= width; x += SIMD_WIDTH / 8) { + auto vec = MMSI(loadu)((MIVEC *)(src + x * 8)); + auto shuf = 
MM(shuffle_epi8)( + vec, BCAST128(_mm_setr_epi8(4, 5, 2, 3, 0, 1, 6, 7, 12, 13, 10, 11, 8, + 9, 14, 15))); + MMSI(storeu)((MIVEC *)(dst + x * 8), shuf); + } + } else if (nb_channels == 3 && bytes_per_channel == 1) { + for (; x + 16 <= width; x += 16) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 3)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 3 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 3 + 32)); + auto s1a = + _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, -1); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 1); + auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(-1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(0, -1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, -1, 15); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 0, -1); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(-1, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 3), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 3 + 32), shuf3); + } + } else if (nb_channels == 3 && bytes_per_channel == 2) { + for (; x + 8 <= width; x += 8) { + auto vec1 = _mm_loadu_si128((__m128i *)(src + x * 6)); + auto vec2 = _mm_loadu_si128((__m128i *)(src + x * 6 + 16)); + auto vec3 = _mm_loadu_si128((__m128i *)(src + x * 6 + 32)); + auto s1a = + _mm_setr_epi8(4, 5, 2, 3, 0, 1, 10, 11, 8, 9, 6, 7, -1, -1, 14, 15); + auto s1b = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, -1, -1); 
+ auto shuf1 = _mm_or_si128(_mm_shuffle_epi8(vec1, s1a), + _mm_shuffle_epi8(vec2, s1b)); + auto s2a = _mm_setr_epi8(12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s2b = + _mm_setr_epi8(-1, -1, 6, 7, 4, 5, 2, 3, 12, 13, 10, 11, 8, 9, -1, -1); + auto s2c = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 2, 3); + auto shuf2 = _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(vec1, s2a), + _mm_shuffle_epi8(vec2, s2b)), + _mm_shuffle_epi8(vec3, s2c)); + auto s3b = _mm_setr_epi8(-1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1); + auto s3c = + _mm_setr_epi8(0, 1, -1, -1, 8, 9, 6, 7, 4, 5, 14, 15, 12, 13, 10, 11); + auto shuf3 = _mm_or_si128(_mm_shuffle_epi8(vec2, s3b), + _mm_shuffle_epi8(vec3, s3c)); + + _mm_storeu_si128((__m128i *)(dst + x * 6), shuf1); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 16), shuf2); + _mm_storeu_si128((__m128i *)(dst + x * 6 + 32), shuf3); + } + } + for (; x < width; x++) { + if (nb_channels == 3 && bytes_per_channel == 1) { + dst[x * 3] = src[x * 3 + 2]; + dst[x * 3 + 1] = src[x * 3 + 1]; + dst[x * 3 + 2] = src[x * 3]; + } + if (nb_channels == 3 && bytes_per_channel == 2) { + dst[x * 6] = src[x * 6 + 4]; + dst[x * 6 + 1] = src[x * 6 + 3]; + dst[x * 6 + 2] = src[x * 6 + 2]; + dst[x * 6 + 3] = src[x * 6 + 3]; + dst[x * 6 + 4] = src[x * 6]; + dst[x * 6 + 5] = src[x * 6 + 1]; + } + if (nb_channels == 4 && bytes_per_channel == 1) { + dst[x * 4] = src[x * 4 + 2]; + dst[x * 4 + 1] = src[x * 4 + 1]; + dst[x * 4 + 2] = src[x * 4]; + dst[x * 4 + 3] = src[x * 4 + 3]; + } + if (nb_channels == 4 && bytes_per_channel == 2) { + dst[x * 8] = src[x * 8 + 4]; + dst[x * 8 + 1] = src[x * 8 + 3]; + dst[x * 8 + 2] = src[x * 8 + 2]; + dst[x * 8 + 3] = src[x * 8 + 3]; + dst[x * 8 + 4] = src[x * 8]; + dst[x * 8 + 5] = src[x * 8 + 1]; + dst[x * 8 + 6] = src[x * 8 + 6]; + dst[x * 8 + 7] = src[x * 8 + 7]; + } + } +} + +} // namespace + +extern "C" size_t FPNGEEncode(size_t bytes_per_channel, size_t 
num_channels, + const void *data, size_t width, size_t row_stride, + size_t height, void *output, + const struct FPNGEOptions *options) { + assert(bytes_per_channel == 1 || bytes_per_channel == 2); + assert(num_channels != 0 && num_channels <= 4); + size_t bytes_per_line = bytes_per_channel * num_channels * width; + assert(row_stride >= bytes_per_line); + + // allows for padding, and for extra initial space for the "left" pixel for + // predictors. + size_t bytes_per_line_buf = + (bytes_per_line + 4 * bytes_per_channel + SIMD_WIDTH - 1) / SIMD_WIDTH * + SIMD_WIDTH; + + // Extra space for alignment purposes. + std::vector buf(bytes_per_line_buf * 2 + SIMD_WIDTH - 1 + + 4 * bytes_per_channel); + unsigned char *aligned_buf_ptr = buf.data() + 4 * bytes_per_channel; + aligned_buf_ptr += (intptr_t)aligned_buf_ptr % SIMD_WIDTH + ? (SIMD_WIDTH - (intptr_t)aligned_buf_ptr % SIMD_WIDTH) + : 0; + + std::vector pdata_buf(bytes_per_line_buf + SIMD_WIDTH - 1); + unsigned char *aligned_pdata_ptr = pdata_buf.data(); + aligned_pdata_ptr += + (intptr_t)aligned_pdata_ptr % SIMD_WIDTH + ? (SIMD_WIDTH - (intptr_t)aligned_pdata_ptr % SIMD_WIDTH) + : 0; + + struct FPNGEOptions default_options; + if (options == nullptr) { + FPNGEFillOptions(&default_options, FPNGE_COMPRESS_LEVEL_DEFAULT, + FPNGE_CICP_NONE); + options = &default_options; + } + + // options sanity check + assert(options->predictor >= 0 && options->predictor <= FPNGE_PREDICTOR_BEST); + assert(options->huffman_sample >= 0 && options->huffman_sample <= 127); + + BitWriter writer; + writer.data = static_cast(output); + + WriteHeader(width, height, bytes_per_channel, num_channels, + options->cicp_colorspace, options->additional_chunks, + options->num_additional_chunks, &writer); + + assert(writer.bits_in_buffer == 0); + size_t chunk_length_pos = writer.bytes_written; + writer.bytes_written += 4; // Skip space for length. 
+ size_t crc_pos = writer.bytes_written; + writer.Write(32, 0x54414449); // IDAT + // Deflate header + writer.Write(8, 8); // deflate with smallest window + writer.Write(8, 29); // cfm+flg check value + + uint64_t symbol_counts[286] = {}; + + // Sample rows in the center of the image. + size_t y0 = height * (127 - options->huffman_sample) / 256; + size_t y1 = height * (129 + options->huffman_sample) / 256; + if (y1 == 0 && height > 0) { // for 1 pixel high images + y1 = 1; + } + + for (size_t y = y0; y < y1; y++) { + const unsigned char *current_row_in = + static_cast(data) + row_stride * y; + unsigned char *current_row_buf = + aligned_buf_ptr + (y % 2 ? bytes_per_line_buf : 0); + const unsigned char *top_buf = + aligned_buf_ptr + ((y + 1) % 2 ? bytes_per_line_buf : 0); + const unsigned char *left_buf = + current_row_buf - bytes_per_channel * num_channels; + const unsigned char *topleft_buf = + top_buf - bytes_per_channel * num_channels; + + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); + if (y == y0 && y != 0) { + continue; + } + + CollectSymbolCounts(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, aligned_pdata_ptr, symbol_counts, options); + } + + memset(buf.data(), 0, buf.size()); + + HuffmanTable huffman_table(symbol_counts); + + // Single block, dynamic huffman + writer.Write(3, 0b101); + WriteHuffmanCode(huffman_table, &writer); + + Crc32 crc; + uint32_t s1 = 1; + uint32_t s2 = 0; + for (size_t y = 0; y < height; y++) { + const unsigned char *current_row_in = + static_cast(data) + row_stride * y; + unsigned char *current_row_buf = + aligned_buf_ptr + (y % 2 ? bytes_per_line_buf : 0); + const unsigned char *top_buf = + aligned_buf_ptr + ((y + 1) % 2 ? 
bytes_per_line_buf : 0); + const unsigned char *left_buf = + current_row_buf - bytes_per_channel * num_channels; + const unsigned char *topleft_buf = + top_buf - bytes_per_channel * num_channels; + + CopyRow(current_row_buf, current_row_in, num_channels, bytes_per_channel, + (FPNGEColorChannelOrder)options->channel_order, width); + + EncodeOneRow(bytes_per_line, current_row_buf, top_buf, left_buf, + topleft_buf, aligned_pdata_ptr, huffman_table, s1, s2, &writer, + options); + + crc_pos += + crc.update(writer.data + crc_pos, writer.bytes_written - crc_pos); + } + + // EOB + writer.Write(huffman_table.nbits[256], huffman_table.end_bits); + + writer.ZeroPadToByte(); + assert(writer.bits_in_buffer == 0); + s1 %= kAdler32Mod; + s2 %= kAdler32Mod; + uint32_t adler32 = (s2 << 16) | s1; + AppendBE32(adler32, &writer); + + size_t data_len = writer.bytes_written - chunk_length_pos - 8; + writer.data[chunk_length_pos + 0] = data_len >> 24; + writer.data[chunk_length_pos + 1] = (data_len >> 16) & 0xFF; + writer.data[chunk_length_pos + 2] = (data_len >> 8) & 0xFF; + writer.data[chunk_length_pos + 3] = data_len & 0xFF; + + auto final_crc = + crc.update_final(writer.data + crc_pos, writer.bytes_written - crc_pos); + AppendBE32(final_crc, &writer); + + // IEND + writer.Write(32, 0); + writer.Write(32, 0x444e4549); + writer.Write(32, 0x826042ae); + + return writer.bytes_written; +} diff --git a/ouster_osf/src/fpnge.h b/ouster_osf/src/fpnge.h new file mode 100644 index 00000000..77db8a76 --- /dev/null +++ b/ouster_osf/src/fpnge.h @@ -0,0 +1,102 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef FPNGE_H +#define FPNGE_H +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum FPNGECicpColorspace { FPNGE_CICP_NONE, FPNGE_CICP_PQ }; + +enum FPNGEOptionsPredictor { + FPNGE_PREDICTOR_FIXED_NOOP, + FPNGE_PREDICTOR_FIXED_SUB, + FPNGE_PREDICTOR_FIXED_TOP, + FPNGE_PREDICTOR_FIXED_AVG, + FPNGE_PREDICTOR_FIXED_PAETH, + FPNGE_PREDICTOR_APPROX, + FPNGE_PREDICTOR_BEST +}; + +enum FPNGEColorChannelOrder { + FPNGE_ORDER_RGB, + FPNGE_ORDER_BGR, +}; + +struct FPNGEAdditionalChunk { + char name[4]; + const char *data; + int data_size; +}; + +struct FPNGEOptions { + char predictor; // FPNGEOptionsPredictor + char huffman_sample; // 0-127: how much of the image to sample + char cicp_colorspace; // FPNGECicpColorspace + char channel_order; // FPNGEColorChannelOrder + int num_additional_chunks; + const struct FPNGEAdditionalChunk *additional_chunks; +}; + +#define FPNGE_COMPRESS_LEVEL_DEFAULT 4 +#define FPNGE_COMPRESS_LEVEL_BEST 5 +inline void FPNGEFillOptions(struct FPNGEOptions *options, int level, + int cicp_colorspace) { + if (level == 0) + level = FPNGE_COMPRESS_LEVEL_DEFAULT; + options->cicp_colorspace = cicp_colorspace; + options->huffman_sample = 1; + options->num_additional_chunks = 0; + options->additional_chunks = NULL; + options->channel_order = FPNGE_ORDER_RGB; + switch (level) { + case 1: + options->predictor = 2; + break; + case 2: + options->predictor = 4; + break; + case 3: + options->predictor = 5; + break; + case 5: + options->huffman_sample = 23; + // fall through + default: + options->predictor = 6; + break; + } +} + +// 
bytes_per_channel = 1/2 for 8-bit and 16-bit. num_channels: 1/2/3/4 +// (G/GA/RGB/RGBA) +size_t FPNGEEncode(size_t bytes_per_channel, size_t num_channels, + const void *data, size_t width, size_t row_stride, + size_t height, void *output, + const struct FPNGEOptions *options); + +inline size_t FPNGEOutputAllocSize(size_t bytes_per_channel, + size_t num_channels, size_t width, + size_t height) { + // likely an overestimate + return 1024 + (2 * bytes_per_channel * width * num_channels + 1) * height; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ouster_osf/src/png_lidarscan_encoder.cpp b/ouster_osf/src/png_lidarscan_encoder.cpp index 7374cf91..2f1309c2 100644 --- a/ouster_osf/src/png_lidarscan_encoder.cpp +++ b/ouster_osf/src/png_lidarscan_encoder.cpp @@ -9,11 +9,26 @@ #include "ouster/osf/png_lidarscan_encoder.h" +#define QOI_IMPLEMENTATION +#include "qoi.h" + #include #include "ouster/impl/logging.h" #include "png_tools.h" +#include + +#include + +#define QOIR_IMPLEMENTATION +//#include "qoir.h" + +#include "fpnge.h" +#include "fpnge.cc" + +#include "zpng.h" + /* Check for the older version of libpng and add missing macros as necessary */ #if (PNG_LIBPNG_VER_MAJOR == 1) @@ -43,6 +58,157 @@ bool PngLidarScanEncoder::fieldEncode(const LidarScan& lidar_scan, "ERROR: scan_data size is not sufficient to hold idx: " + std::to_string(scan_idx)); } + + /*FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto field = lidar_scan.field(field_type.name); + auto data = field.get(); + + int channels = 1; + int bytewidth = 1; + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + break; + case sensor::ChanFieldType::UINT16: + bytewidth = 2; + break; + case sensor::ChanFieldType::UINT32: + channels = 4; + break; + case sensor::ChanFieldType::UINT64: + break; + } + int 
stride = lidar_scan.w * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, lidar_scan.w, lidar_scan.h); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, lidar_scan.w, stride, lidar_scan.h, output, &options); + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), output, len); + free(output); + return false;*/ + + ZPNG_ImageData opts; + opts.HeightPixels = lidar_scan.h; + opts.WidthPixels = lidar_scan.w; + + auto field = lidar_scan.field(field_type.name); + auto data = field.get(); + + opts.Buffer.Data = (unsigned char*)data; + opts.Buffer.Bytes = field.bytes(); + + int channels = 1; + int bytewidth = 1; + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + break; + case sensor::ChanFieldType::UINT16: + bytewidth = 2; + break; + case sensor::ChanFieldType::UINT32: + channels = 4; + break; + case sensor::ChanFieldType::UINT64: + break; + } + + opts.BytesPerChannel = bytewidth; + opts.Channels = channels; + int stride = lidar_scan.w * channels * bytewidth; + opts.StrideBytes = stride; + + auto out = ZPNG_Compress(&opts); + + auto len = out.Bytes; + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), out.Data, len); + ZPNG_Free(&out); + return false; + + /*qoi_desc desc; + desc.width = lidar_scan.w/4; + desc.height = lidar_scan.h; + desc.channels = 4; + desc.colorspace = 0; + + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + desc.width *= 4; + desc.channels = 1; + break; + case sensor::ChanFieldType::UINT16: + desc.height *= 2; + break; + case sensor::ChanFieldType::UINT32: + desc.height *= 4; + break; + case sensor::ChanFieldType::UINT64: + desc.height *= 8; + break; + } + + int qlen; + void* result = qoi_encode(lidar_scan.field(field_type.name).get(), &desc, &qlen); + + auto field = lidar_scan.field(field_type.name); + auto bytes = qlen; + auto len = LZ4_compressBound(bytes); + scan_data[scan_idx].resize(len); + auto 
real_len = LZ4_compress_default((char*)result, (char*)scan_data[scan_idx].data(), bytes, len); + scan_data[scan_idx].resize(real_len); + free(result);*/ + + /*qoi_desc desc; + desc.width = lidar_scan.w/4; + desc.height = lidar_scan.h; + desc.channels = 4; + desc.colorspace = 0; + + switch (field_type.element_type) { + case sensor::ChanFieldType::UINT8: + desc.width *= 4; + desc.channels = 1; + break; + case sensor::ChanFieldType::UINT16: + desc.height *= 2; + break; + case sensor::ChanFieldType::UINT32: + desc.height *= 4; + break; + case sensor::ChanFieldType::UINT64: + desc.height *= 8; + break; + } + + int len; + void* result = qoi_encode(lidar_scan.field(field_type.name).get(), &desc, &len); + bool res = true; + scan_data[scan_idx].resize(len); + memcpy(scan_data[scan_idx].data(), result, len); + free(result);*/ + + /*auto field = lidar_scan.field(field_type.name); + auto bytes = field.bytes(); + auto len = LZ4_compressBound(bytes); + scan_data[scan_idx].resize(len); + auto real_len = LZ4_compress_default((char*)field.get(), (char*)scan_data[scan_idx].data(), bytes, len); + scan_data[scan_idx].resize(real_len);*/ + + /*auto field = lidar_scan.field(field_type.name); + auto bytes = field.bytes(); + auto len = compressBound(bytes); + scan_data[scan_idx].resize(len); + auto real_len = len; + compress((unsigned char*)scan_data[scan_idx].data(), &real_len, (unsigned char*)field.get(), bytes); + scan_data[scan_idx].resize(real_len);*/ + //printf("%li vs %li %f\n", bytes, real_len, (float)real_len/(float)bytes); + bool res = true; switch (field_type.element_type) { case sensor::ChanFieldType::UINT8: @@ -76,7 +242,7 @@ bool PngLidarScanEncoder::fieldEncode(const LidarScan& lidar_scan, logger().error("ERROR: fieldEncode: Can't encode field {}", field_type.name); } - return res; + return false; } ScanChannelData PngLidarScanEncoder::encodeField( @@ -141,6 +307,28 @@ bool PngLidarScanEncoder::encode8bitImage( ScanChannelData& res_buf, const Eigen::Ref>& img) const { const 
uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto data = img.data(); + + int channels = 1; + int bytewidth = 1; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, width, stride, height, output, &options); + res_buf.resize(len); + memcpy(res_buf.data(), output, len); + free(output); + + return false; // 8 bit Gray const int sample_depth = 8; @@ -188,6 +376,36 @@ bool PngLidarScanEncoder::encode16bitImage( const uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + std::vector data2(height*width*2); + for (size_t u = 0; u < height; ++u) { + for (size_t v = 0; v < width; ++v) { + const uint64_t key_val = img(u, v); + + // 16bit Encoding Logic + data2[u*width*2 + v * 2 + 1] = static_cast(key_val & 0xff); + data2[u*width*2 + v * 2 + 0] = static_cast((key_val >> 8u) & 0xff); + } + } + + int channels = 1; + int bytewidth = 2; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + res_buf.resize(buf_size); + auto output = res_buf.data(); + auto len = FPNGEEncode(bytewidth, channels, data2.data(), width, stride, height, output, &options); + res_buf.resize(len); + + return false; + // 16 bit Gray const int sample_depth = 16; const int color_type = PNG_COLOR_TYPE_GRAY; @@ -312,6 +530,28 @@ bool 
PngLidarScanEncoder::encode32bitImage( const uint32_t width = static_cast(img.cols()); const uint32_t height = static_cast(img.rows()); + FPNGEOptions options; + options.channel_order = 0; + options.huffman_sample = 10; + options.cicp_colorspace = 0; + options.num_additional_chunks = 0; + options.predictor = FPNGE_PREDICTOR_BEST; + FPNGEFillOptions(&options, 5, 0); + + auto data = img.data(); + + int channels = 4; + int bytewidth = 1; + int stride = width * channels * bytewidth; + auto buf_size = FPNGEOutputAllocSize(bytewidth, channels, width, height); + auto output = malloc(buf_size); + auto len = FPNGEEncode(bytewidth, channels, data, width, stride, height, output, &options); + res_buf.resize(len); + memcpy(res_buf.data(), output, len); + free(output); + + return false; + // 8bit RGBA const int sample_depth = 8; const int color_type = PNG_COLOR_TYPE_RGB_ALPHA; diff --git a/ouster_osf/src/png_tools.cpp b/ouster_osf/src/png_tools.cpp index 4999394d..07dbe420 100644 --- a/ouster_osf/src/png_tools.cpp +++ b/ouster_osf/src/png_tools.cpp @@ -11,9 +11,13 @@ #include #include +#include "qoi.h" + #include "ouster/impl/logging.h" #include "ouster/lidar_scan.h" +#include "zpng.h" + using namespace ouster::sensor; namespace ouster { @@ -179,6 +183,19 @@ void png_osf_write_start(png_structp png_ptr, png_infop png_info_ptr, bool fieldDecode(LidarScan& lidar_scan, const ScanData& scan_data, size_t start_idx, const ouster::FieldType& field_type, const std::vector& px_offset) { + ZPNG_Buffer buffer; + buffer.Bytes = scan_data[start_idx].size(); + buffer.Data = (unsigned char*)scan_data[start_idx].data(); + auto out = ZPNG_Decompress(buffer); + + if (out.Buffer.Data) { + // we can avoid a copy here if we want to microoptimize by changing zpng decompress + auto& field = lidar_scan.field(field_type.name); + memcpy(field.get(), out.Buffer.Data, out.Buffer.Bytes); + ZPNG_Free(&out.Buffer); + return false; + } + switch (field_type.element_type) { case sensor::ChanFieldType::UINT8: 
return decode8bitImage(lidar_scan.field(field_type.name), diff --git a/ouster_osf/src/qoi.h b/ouster_osf/src/qoi.h new file mode 100644 index 00000000..bdf7a0f7 --- /dev/null +++ b/ouster_osf/src/qoi.h @@ -0,0 +1,1079 @@ +/* + +QOI - The "Quite OK Image" format for fast, lossless image compression + +Dominic Szablewski - https://phoboslab.org + +Greyscale Additions - Jay Stavnitzky + +-- LICENSE: The MIT License(MIT) + +Copyright(c) 2021 Dominic Szablewski + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files(the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and / or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions : +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +-- About + +QOI encodes and decodes images in a lossless format. Compared to stb_image and +stb_image_write QOI offers 20x-50x faster encoding, 3x-4x faster decoding and +20% better compression. + + +-- Synopsis + +// Define `QOI_IMPLEMENTATION` in *one* C/C++ file before including this +// library to create the implementation. + +#define QOI_IMPLEMENTATION +#include "qoi.h" + +// Encode and store an RGBA buffer to the file system. The qoi_desc describes +// the input pixel data. 
+qoi_write("image_new.qoi", rgba_pixels, &(qoi_desc){ + .width = 1920, + .height = 1080, + .channels = 4, + .colorspace = QOI_SRGB +}); + +// Load and decode a QOI image from the file system into a 32bbp RGBA buffer. +// The qoi_desc struct will be filled with the width, height, number of channels +// and colorspace read from the file header. +qoi_desc desc; +void *rgba_pixels = qoi_read("image.qoi", &desc, 4); + + + +-- Documentation + +This library provides the following functions; +- qoi_read -- read and decode a QOI file +- qoi_decode -- decode the raw bytes of a QOI image from memory +- qoi_write -- encode and write a QOI file +- qoi_encode -- encode an rgba buffer into a QOI image in memory + +See the function declaration below for the signature and more information. + +If you don't want/need the qoi_read and qoi_write functions, you can define +QOI_NO_STDIO before including this library. + +This library uses malloc() and free(). To supply your own malloc implementation +you can define QOI_MALLOC and QOI_FREE before including this library. + +This library uses memset() to zero-initialize the index. To supply your own +implementation you can define QOI_ZEROARR before including this library. + + +-- Data Format + +A QOI file has a 14 byte header, followed by any number of data "chunks" and an +8-byte end marker. + +struct qoi_header_t { + char magic[4]; // magic bytes "qoif" + uint32_t width; // image width in pixels (BE) + uint32_t height; // image height in pixels (BE) + uint8_t channels; // 3 = RGB, 4 = RGBA, 1=GREY + uint8_t colorspace; // 0 = sRGB with linear alpha, 1 = all channels linear +}; + +Images are encoded row by row, left to right, top to bottom. The decoder and +encoder start with {r: 0, g: 0, b: 0, a: 255} as the previous pixel value. An +image is complete when all pixels specified by width * height have been covered. 
+
+COLOUR PIXEL FORMATS 3 or 4 bytes per pixel format
+
+Colour pixels are encoded as
+ - a run of the previous pixel
+ - an index into an array of previously seen pixels
+ - a difference to the previous pixel value in r,g,b
+ - full r,g,b or r,g,b,a values
+
+The color channels are assumed to not be premultiplied with the alpha channel
+("un-premultiplied alpha").
+
+A running array[64] (zero-initialized) of previously seen pixel values is
+maintained by the encoder and decoder. Each pixel that is seen by the encoder
+and decoder is put into this array at the position formed by a hash function of
+the color value. In the encoder, if the pixel value at the index matches the
+current pixel, this index position is written to the stream as QOI_OP_INDEX.
+The hash function for the index is:
+
+ index_position = (r * 3 + g * 5 + b * 7 + a * 11) % 64
+
+Each chunk starts with a 2- or 8-bit tag, followed by a number of data bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The 8-bit tags have precedence over the 2-bit tags. A decoder must check for the
+presence of an 8-bit tag first.
+
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.
+
+
+The possible chunks are:
+
+.- QOI_OP_INDEX ----------.
+| Byte[0] |
+| 7 6 5 4 3 2 1 0 |
+|-------+-----------------|
+| 0 0 | index |
+`-------------------------`
+2-bit tag b00
+6-bit index into the color index array: 0..63
+
+A valid encoder must not issue 2 or more consecutive QOI_OP_INDEX chunks to the
+same index. QOI_OP_RUN should be used instead.
+
+
+.- QOI_OP_DIFF -----------.
+| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----+-----+-----| +| 0 1 | dr | dg | db | +`-------------------------` +2-bit tag b01 +2-bit red channel difference from the previous pixel between -2..1 +2-bit green channel difference from the previous pixel between -2..1 +2-bit blue channel difference from the previous pixel between -2..1 + +Values are stored as unsigned integers with a bias of 2. E.g. -2 is stored as +0 (b00). 1 is stored as 3 (b11). + +The difference to the current channel values are using a wraparound operation, +so "1 - 2" will result in 255, while "255 + 1" will result in 0. + +The alpha value remains unchanged from the previous pixel. + +.- QOI_OP_LUMA -------------------------------------. +| Byte[0] | Byte[1] | +| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | +|-------+-----------------+-------------+-----------| +| 1 0 | green diff | dr - dg | db - dg | +`---------------------------------------------------` +2-bit tag b10 +6-bit green channel difference from the previous pixel -32..31 +4-bit red channel difference minus green channel difference -8..7 +4-bit blue channel difference minus green channel difference -8..7 + +The green channel is used to indicate the general direction of change and is +encoded in 6 bits. The red and blue channels (dr and db) base their diffs off +of the green channel difference and are encoded in 4 bits. I.e.: + dr_dg = (cur_px.r - prev_px.r) - (cur_px.g - prev_px.g) + db_dg = (cur_px.b - prev_px.b) - (cur_px.g - prev_px.g) + +The difference to the current channel values are using a wraparound operation, +so "10 - 13" will result in 253, while "250 + 7" will result in 1. + +Values are stored as unsigned integers with a bias of 32 for the green channel +and a bias of 8 for the red and blue channel. + +The alpha value remains unchanged from the previous pixel. + +.- QOI_OP_RUN ------------. 
+| Byte[0] |
+| 7 6 5 4 3 2 1 0 |
+|-------+-----------------|
+| 1 1 | run |
+`-------------------------`
+2-bit tag b11
+6-bit run-length repeating the previous pixel: 1..62
+
+Note that the run-lengths 63 and 64
+(b111110 and b111111) are illegal as they are occupied by the QOI_OP_RGB and
+QOI_OP_RGBA tags.
+
+The run-length is stored with a bias of -1.
+
+.- QOI_OP_RGB ------------------------------------------.
+| Byte[0] | Byte[1] | Byte[2] | Byte[3] |
+| 7 6 5 4 3 2 1 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 |
+|-------------------------+---------+---------+---------|
+| 1 1 1 1 1 1 1 0 | red | green | blue |
+`-------------------------------------------------------`
+8-bit tag b11111110
+8-bit red channel value
+8-bit green channel value
+8-bit blue channel value
+
+The alpha value remains unchanged from the previous pixel.
+
+.- QOI_OP_RGBA ---------------------------------------------------.
+| Byte[0] | Byte[1] | Byte[2] | Byte[3] | Byte[4] |
+| 7 6 5 4 3 2 1 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 | 7 .. 0 |
+|-------------------------+---------+---------+---------+---------|
+| 1 1 1 1 1 1 1 1 | red | green | blue | alpha |
+`-----------------------------------------------------------------`
+8-bit tag b11111111
+8-bit red channel value
+8-bit green channel value
+8-bit blue channel value
+8-bit alpha channel value
+
+
+GREYSCALE PIXEL FORMAT 1 byte per pixel format
+
+Grey pixels are encoded as
+ - a run of the previous pixel value
+ - differences to the previous pixel value (two 4-bit differences per byte)
+ - full intensity values
+
+Each chunk starts with a 2-bit tag, followed by a number of bits. The
+bit length of chunks is divisible by 8 - i.e. all chunks are byte aligned. All
+values encoded in these data bits have the most significant bit on the left.
+
+The byte stream's end is marked with 7 0x00 bytes followed by a single 0x01 byte.
+ + +The possible chunks are: + +.- QOI_OP1_2DIFF ---------.-------------------------- +| Byte[0] | Byte[1] | +| 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0 | +|-------+-----------------|------------+------------| +| 0 1 | diff byte count | diff0 | diff1 | ...... +`-------------------------`-------------------------` +2-bit tag b01 +6-bit difference byte count for the next N bytes (so 2 x N pixels): 1..64 +two 4-bit greyscale differences from the previous pixel, they are found in pairs and 2 fill bytes. There is always 2. + +Values are stored as unsigned integers with a bias of 8. E.g. -8 is stored as +0 (b00). + +The difference to the current channel values are using a wraparound operation, +so "1 - 2" will result in 255, while "255 + 1" will result in 0. + + +.- QOI_OP1_RAW ------------.------- +| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----------------|------- +| 1 0 | raw bytes | pixel value 1,2,3,etc +`-------------------------`------- +2-bit tag b10 +6-bit raw byte data for the next N pixels: 1..64 +8-bit raw data bytes of pixel intensity values + + +The raw byte length is stored with a bias of -1. +.- QOI_OP1_RUN ------------. +| Byte[0] | +| 7 6 5 4 3 2 1 0 | +|-------+-----------------| +| 1 1 | run | +`-------------------------` +2-bit tag b11 +6-bit run-length repeating the previous pixel value: 1..64 pixels + +The run-length is stored with a bias of -1. + +*/ + + +/* ----------------------------------------------------------------------------- +Header - Public functions */ + +#ifndef QOI_H +#define QOI_H + +#pragma warning(disable : 4996) + +#ifdef __cplusplus +extern "C" { +#endif + +/* A pointer to a qoi_desc struct has to be supplied to all of qoi's functions. +It describes either the input format (for qoi_write and qoi_encode), or is +filled with the description read from the file header (for qoi_read and +qoi_decode). + +The colorspace in this qoi_desc is an enum where + 0 = sRGB, i.e. 
gamma scaled RGB channels and a linear alpha channel + 1 = all channels are linear +You may use the constants QOI_SRGB or QOI_LINEAR. The colorspace is purely +informative. It will be saved to the file header, but does not affect +how chunks are en-/decoded. */ + +#define QOI_SRGB 0 +#define QOI_LINEAR 1 + +typedef struct { + unsigned int width; + unsigned int height; + unsigned char channels; + unsigned char colorspace; +} qoi_desc; + +#ifndef QOI_NO_STDIO + +/* Encode raw RGB or RGBA pixels into a QOI image and write it to the file +system. The qoi_desc struct must be filled with the image width, height, +number of channels (3 = RGB, 4 = RGBA) and the colorspace. + +The function returns 0 on failure (invalid parameters, or fopen or malloc +failed) or the number of bytes written on success. */ + +int qoi_write(const char *filename, const void *data, const qoi_desc *desc); + + +/* Read and decode a QOI image from the file system. If channels is 0, the +number of channels from the file header is used. If channels is 3 or 4 the +output format will be forced into this number of channels. + +The function either returns NULL on failure (invalid data, or malloc or fopen +failed) or a pointer to the decoded pixels. On success, the qoi_desc struct +will be filled with the description from the file header. + +The returned pixel data should be free()d after use. */ + +void *qoi_read(const char *filename, qoi_desc *desc, int channels); + +#endif /* QOI_NO_STDIO */ + + +/* Encode raw RGB or RGBA pixels into a QOI image in memory. + +The function either returns NULL on failure (invalid parameters or malloc +failed) or a pointer to the encoded data on success. On success the out_len +is set to the size in bytes of the encoded data. + +The returned qoi data should be free()d after use. */ + +void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len); + + +/* Decode a QOI image from memory. 
+ +The function either returns NULL on failure (invalid parameters or malloc +failed) or a pointer to the decoded pixels. On success, the qoi_desc struct +is filled with the description from the file header. + +The returned pixel data should be free()d after use. */ + +void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels); + + +#ifdef __cplusplus +} +#endif +#endif /* QOI_H */ + + +/* ----------------------------------------------------------------------------- +Implementation */ + +#ifdef QOI_IMPLEMENTATION +#include +#include + +#ifndef QOI_MALLOC + #define QOI_MALLOC(sz) malloc(sz) + #define QOI_FREE(p) free(p) +#endif +#ifndef QOI_ZEROARR + #define QOI_ZEROARR(a) memset((a),0,sizeof(a)) +#endif + +#define QOI_OP_INDEX 0x00 /* 00xxxxxx */ +#define QOI_OP_DIFF 0x40 /* 01xxxxxx */ +#define QOI_OP_LUMA 0x80 /* 10xxxxxx */ +#define QOI_OP_RUN 0xc0 /* 11xxxxxx */ +#define QOI_OP_RGB 0xfe /* 11111110 */ +#define QOI_OP_RGBA 0xff /* 11111111 */ + +#define QOI_MASK_2 0xc0 /* 11000000 */ +#define QOI_MASK_3 0xe0 /* 11100000 */ + +#define QOI_COLOR_HASH(C) (C.rgba.r*3 + C.rgba.g*5 + C.rgba.b*7 + C.rgba.a*11) + +#define QOI_MAGIC \ + (((unsigned int)'q') << 24 | ((unsigned int)'o') << 16 | \ + ((unsigned int)'i') << 8 | ((unsigned int)'f')) +#define QOI_HEADER_SIZE 14 + +/* 2GB is the max file size that this implementation can safely handle. We guard +against anything larger than that, assuming the worst case with 5 bytes per +pixel, rounded down to a nice clean value. 400 million pixels ought to be +enough for anybody. 
*/ +#define QOI_PIXELS_MAX ((unsigned int)400000000) + +typedef union { + struct { unsigned char r, g, b, a; } rgba; + unsigned int v; +} qoi_rgba_t; + +static const unsigned char qoi_padding[8] = {0,0,0,0,0,0,0,1}; + +static void qoi_write_32(unsigned char *bytes, int *p, unsigned int v) { + bytes[(*p)++] = (0xff000000 & v) >> 24; + bytes[(*p)++] = (0x00ff0000 & v) >> 16; + bytes[(*p)++] = (0x0000ff00 & v) >> 8; + bytes[(*p)++] = (0x000000ff & v); +} + +static unsigned int qoi_read_32(const unsigned char *bytes, int *p) { + unsigned int a = bytes[(*p)++]; + unsigned int b = bytes[(*p)++]; + unsigned int c = bytes[(*p)++]; + unsigned int d = bytes[(*p)++]; + return a << 24 | b << 16 | c << 8 | d; +} + +//turn features on and off for testing, raw is always necessary +#define ENABLE_RUN +#define ENABLE_2DIFF +//define ENABLE_COUNTS //turn on keeping/printing of statistics + +#define QOI_OP1_2DIFF 0x40 /* 01xxxxxx 4bit difference*/ +#define QOI_OP1_RAW 0x80 /* 10xxxxxx raw bytes */ +#define QOI_OP1_RUN 0xc0 /* 11xxxxxx run length*/ + +void *qoi_grey_encode(const void *data, const qoi_desc *desc, int *out_len) { + + int i, max_size, p; + + unsigned char run; + + int px_len, px_end, px_pos, channels, px_stop; + + unsigned char *bytes; + + unsigned char mode, mode_next; + const unsigned char *pixels; + const unsigned char *thispixel; + + unsigned char px, px_prev; + unsigned char px1; + + signed char dv0, dv1; //deltas between pixels + + unsigned char *run_start=NULL; + +#ifdef ENABLE_COUNTS + unsigned long count_run = 0; + unsigned long count_raw = 0; + unsigned long count_2diff = 0; +#endif + + if ( + data == NULL || out_len == NULL || desc == NULL || + desc->width == 0 || desc->height == 0 || + desc->channels != 1 || + desc->colorspace > 1 || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + max_size = + desc->width * desc->height * (desc->channels + 1) + + QOI_HEADER_SIZE + sizeof(qoi_padding); + + p = 0; + bytes = (unsigned char *) 
QOI_MALLOC(max_size); + if (!bytes) { + return NULL; + } + + qoi_write_32(bytes, &p, QOI_MAGIC); + qoi_write_32(bytes, &p, desc->width); + qoi_write_32(bytes, &p, desc->height); + bytes[p++] = desc->channels; + bytes[p++] = desc->colorspace; + + + pixels = (const unsigned char *)data; + + run = 0; + mode = 0; + px_prev = 0; + px = px_prev; + px1 = px; + + px_len = desc->width * desc->height * desc->channels; + px_end = px_len - desc->channels; //move back the to the beginning of the last pixel + px_stop = px_end - 2*(desc->channels); //this is the last pixel we can look at in the main loop + channels = desc->channels; + + //always do the first byte in raw + run_start = &(bytes[p]); + bytes[p++] = QOI_OP1_RAW; //we don't know how many bytes yet, but output the 2 type bits + bytes[p++] = *(pixels); + run = 1; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + mode = QOI_OP1_RAW; + px_prev = *(pixels); + thispixel = pixels + 1; + + for (px_pos = 1; px_pos < px_stop; px_pos++) { + + //thispixel = pixels + px_pos; + + px = *(thispixel++); + px1 = *(thispixel); + +#ifdef ENABLE_RUN + if (px == px_prev) + { //same value - run length + mode_next = QOI_OP1_RUN; +#ifdef ENABLE_COUNTS + count_run++; +#endif + + } + else +#endif //ENABLE_RUN + { + dv0 = px - px_prev; //deltas between pixels + dv1 = px1 - px; + +#ifdef ENABLE_2DIFF + if (dv0 > -9 && dv0 < 8 && + dv1 > -9 && dv1 < 8 + ) + { + mode_next = QOI_OP1_2DIFF; +#ifdef ENABLE_COUNTS + count_2diff += 2; +#endif + } + else +#endif //ENABLE_2DIFF + { //default is RAW bytes + mode_next = QOI_OP1_RAW; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + } + } + + if (mode_next != mode || run == 62 ) //we're switching to a different mode, close one mode and start the next + { + *run_start = (*run_start) | (run - 1); + run = 0; + + run_start = &(bytes[p]); + bytes[p++] = mode_next; //we don't know how many bytes yet, but output the 2 type bits + + mode = mode_next; + } + + run++;// we've always created one byte of output data + +#ifdef 
ENABLE_2DIFF + if (mode == QOI_OP1_2DIFF) + { + bytes[p++] = (dv0 + 8) << 4 | (dv1 + 8); + px_pos++; + thispixel++; + px_prev = px1; + } + else +#endif + if (mode == QOI_OP1_RAW) + { + bytes[p++] = px; + px_prev = px; + } + + } + + //close out the last one + *run_start = (*run_start) | (run - 1); + run=0; + + if (px_pos != px_end) //if we have any left over pixels, store them RAW + { + run_start = &(bytes[p]); + bytes[p++] = QOI_OP1_RAW; //we don't know how many bytes yet, but output the 2 type bits + + while (px_pos <= px_end) + { + bytes[p++] = *(thispixel++); + px_pos++; + run++; +#ifdef ENABLE_COUNTS + count_raw++; +#endif + + } + *run_start = (*run_start) | (run - 1); + run = 0; + } + + for (i = 0; i < (int)sizeof(qoi_padding); i++) { + bytes[p++] = qoi_padding[i]; + } + +#ifdef ENABLE_COUNTS + unsigned long count_total = count_run+ count_raw+ count_2diff; + double dcount_total = count_total; + + printf("encode counts: run %ld, raw %ld, 2diff %ld, total %ld, inlen %ld, outlen %ld\n", + count_run, count_raw, count_2diff, count_total, px_len,p); + printf("encode percent: run %.2f, raw %.2f, 2diff %.2f, out %.3f\n", + count_run/dcount_total, count_raw / dcount_total, count_2diff / dcount_total, p/(double)(px_len)); + +#endif + + *out_len = p; + return bytes; +} + +void *qoi_grey_decode(const void *data, int size, qoi_desc *desc, int channels) { + const unsigned char *bytes; + unsigned int header_magic; + unsigned char *pixels; + + //qoi_rgba_t index[64]; + unsigned char px; + int px_len, chunks_len, px_pos; + int p = 0, run = 0, raw=0; + + if ( + data == NULL || desc == NULL || + (channels != 0 && channels != 1) || + size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding) + ) { + return NULL; + } + + bytes = (const unsigned char *)data; + + header_magic = qoi_read_32(bytes, &p); + desc->width = qoi_read_32(bytes, &p); + desc->height = qoi_read_32(bytes, &p); + desc->channels = bytes[p++]; + desc->colorspace = bytes[p++]; + + if ( + desc->width == 0 || desc->height == 0 
|| + //desc->channels < 3 || desc->channels > 4 || + desc->channels != 1 || + desc->colorspace > 1 || + header_magic != QOI_MAGIC || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + if (channels == 0) { + channels = desc->channels; + } + + px_len = desc->width * desc->height * channels; + pixels = (unsigned char *) QOI_MALLOC(px_len); + if (!pixels) { + return NULL; + } + + px = 0; + + chunks_len = size - (int)sizeof(qoi_padding); + for (px_pos = 0; px_pos < px_len; px_pos++) + { + int b1 = bytes[p++]; + + if ((b1 & QOI_MASK_2) == QOI_OP1_2DIFF) { + run = (b1 & 0x3f); + while (run >= 0) + { + b1 = bytes[p++]; + px += ((b1 >> 4) & 0x0f) - 8; + + //dump a pixel here + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + + px += ((b1) & 0x0f) - 8; + + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + + run--; + } + px_pos--; + + } + else if ((b1 & QOI_MASK_2) == QOI_OP1_RUN) { + run = (b1 & 0x3f); + while (run >= 0) + { + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + run--; + } + px_pos--; + } + else if ((b1 & QOI_MASK_2) == QOI_OP1_RAW) { + raw = (b1 & 0x3f); + while (raw >= 0) + { + px = bytes[p++]; + if (px_pos < px_len) + { + *(pixels + px_pos) = px; + } + px_pos++; + raw--; + } + px_pos--; + } + } + + return pixels; +} + +void* qoi_colour_encode(const void* data, const qoi_desc* desc, int* out_len) { + int i, max_size, p, run; + int px_len, px_end, px_pos, channels; + unsigned char* bytes; + const unsigned char* pixels; + qoi_rgba_t index[64]; + qoi_rgba_t px, px_prev; + + if ( + data == NULL || out_len == NULL || desc == NULL || + desc->width == 0 || desc->height == 0 || + desc->channels < 3 || desc->channels > 4 || + desc->colorspace > 1 || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + max_size = + desc->width * desc->height * (desc->channels + 1) + + QOI_HEADER_SIZE + sizeof(qoi_padding); + + p = 0; + bytes = (unsigned 
char*)QOI_MALLOC(max_size); + if (!bytes) { + return NULL; + } + + qoi_write_32(bytes, &p, QOI_MAGIC); + qoi_write_32(bytes, &p, desc->width); + qoi_write_32(bytes, &p, desc->height); + bytes[p++] = desc->channels; + bytes[p++] = desc->colorspace; + + + pixels = (const unsigned char*)data; + + QOI_ZEROARR(index); + + run = 0; + px_prev.rgba.r = 0; + px_prev.rgba.g = 0; + px_prev.rgba.b = 0; + px_prev.rgba.a = 255; + px = px_prev; + + px_len = desc->width * desc->height * desc->channels; + px_end = px_len - desc->channels; + channels = desc->channels; + + for (px_pos = 0; px_pos < px_len; px_pos += channels) { + if (channels == 4) { + px = *(qoi_rgba_t*)(pixels + px_pos); + } + else { + px.rgba.r = pixels[px_pos + 0]; + px.rgba.g = pixels[px_pos + 1]; + px.rgba.b = pixels[px_pos + 2]; + } + + if (px.v == px_prev.v) { + run++; + if (run == 62 || px_pos == px_end) { + bytes[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + } + else { + int index_pos; + + if (run > 0) { + bytes[p++] = QOI_OP_RUN | (run - 1); + run = 0; + } + + index_pos = QOI_COLOR_HASH(px) % 64; + + if (index[index_pos].v == px.v) { + bytes[p++] = QOI_OP_INDEX | index_pos; + } + else { + index[index_pos] = px; + + if (px.rgba.a == px_prev.rgba.a) { + signed char vr = px.rgba.r - px_prev.rgba.r; + signed char vg = px.rgba.g - px_prev.rgba.g; + signed char vb = px.rgba.b - px_prev.rgba.b; + + signed char vg_r = vr - vg; + signed char vg_b = vb - vg; + + if ( + vr > -3 && vr < 2 && + vg > -3 && vg < 2 && + vb > -3 && vb < 2 + ) { + bytes[p++] = QOI_OP_DIFF | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2); + } + else if ( + vg_r > -9 && vg_r < 8 && + vg > -33 && vg < 32 && + vg_b > -9 && vg_b < 8 + ) { + bytes[p++] = QOI_OP_LUMA | (vg + 32); + bytes[p++] = (vg_r + 8) << 4 | (vg_b + 8); + } + else { + bytes[p++] = QOI_OP_RGB; + bytes[p++] = px.rgba.r; + bytes[p++] = px.rgba.g; + bytes[p++] = px.rgba.b; + } + } + else { + bytes[p++] = QOI_OP_RGBA; + bytes[p++] = px.rgba.r; + bytes[p++] = px.rgba.g; + bytes[p++] = 
px.rgba.b; + bytes[p++] = px.rgba.a; + } + } + } + px_prev = px; + } + + for (i = 0; i < (int)sizeof(qoi_padding); i++) { + bytes[p++] = qoi_padding[i]; + } + + *out_len = p; + return bytes; +} + +void* qoi_colour_decode(const void* data, int size, qoi_desc* desc, int channels) { + const unsigned char* bytes; + unsigned int header_magic; + unsigned char* pixels; + qoi_rgba_t index[64]; + qoi_rgba_t px; + int px_len, chunks_len, px_pos; + int p = 0, run = 0; + + if ( + data == NULL || desc == NULL || + (channels != 0 && channels != 3 && channels != 4) || + size < QOI_HEADER_SIZE + (int)sizeof(qoi_padding) + ) { + return NULL; + } + + bytes = (const unsigned char*)data; + + header_magic = qoi_read_32(bytes, &p); + desc->width = qoi_read_32(bytes, &p); + desc->height = qoi_read_32(bytes, &p); + desc->channels = bytes[p++]; + desc->colorspace = bytes[p++]; + + if ( + desc->width == 0 || desc->height == 0 || + desc->channels < 3 || desc->channels > 4 || + desc->colorspace > 1 || + header_magic != QOI_MAGIC || + desc->height >= QOI_PIXELS_MAX / desc->width + ) { + return NULL; + } + + if (channels == 0) { + channels = desc->channels; + } + + px_len = desc->width * desc->height * channels; + pixels = (unsigned char*)QOI_MALLOC(px_len); + if (!pixels) { + return NULL; + } + + QOI_ZEROARR(index); + px.rgba.r = 0; + px.rgba.g = 0; + px.rgba.b = 0; + px.rgba.a = 255; + + chunks_len = size - (int)sizeof(qoi_padding); + for (px_pos = 0; px_pos < px_len; px_pos += channels) { + if (run > 0) { + run--; + } + else if (p < chunks_len) { + int b1 = bytes[p++]; + + if (b1 == QOI_OP_RGB) { + px.rgba.r = bytes[p++]; + px.rgba.g = bytes[p++]; + px.rgba.b = bytes[p++]; + } + else if (b1 == QOI_OP_RGBA) { + px.rgba.r = bytes[p++]; + px.rgba.g = bytes[p++]; + px.rgba.b = bytes[p++]; + px.rgba.a = bytes[p++]; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_INDEX) { + px = index[b1]; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_DIFF) { + px.rgba.r += ((b1 >> 4) & 0x03) - 2; + px.rgba.g += ((b1 >> 
2) & 0x03) - 2; + px.rgba.b += (b1 & 0x03) - 2; + } + else if ((b1 & QOI_MASK_2) == QOI_OP_LUMA) { + int b2 = bytes[p++]; + int vg = (b1 & 0x3f) - 32; + px.rgba.r += vg - 8 + ((b2 >> 4) & 0x0f); + px.rgba.g += vg; + px.rgba.b += vg - 8 + (b2 & 0x0f); + } + else if ((b1 & QOI_MASK_2) == QOI_OP_RUN) { + run = (b1 & 0x3f); + } + + index[QOI_COLOR_HASH(px) % 64] = px; + } + + if (channels == 4) { + *(qoi_rgba_t*)(pixels + px_pos) = px; + } + else { + pixels[px_pos + 0] = px.rgba.r; + pixels[px_pos + 1] = px.rgba.g; + pixels[px_pos + 2] = px.rgba.b; + } + } + + return pixels; +} + + +void *qoi_encode(const void *data, const qoi_desc *desc, int *out_len) { + + if(desc->channels == 3 || desc->channels == 4) + { + return qoi_colour_encode(data, desc, out_len); + } + else if (desc->channels == 1) + { + return qoi_grey_encode(data, desc, out_len); + } +} + +void *qoi_decode(const void *data, int size, qoi_desc *desc, int channels) + { + + void *retptr=NULL; + + //try it as colour + retptr=qoi_colour_decode(data, size, desc, channels); + + //if not try it as grey + if(retptr==NULL) + { + retptr=qoi_grey_decode(data, size, desc, channels); + } + + return retptr; +} + + +#ifndef QOI_NO_STDIO +#include + +int qoi_write(const char *filename, const void *data, const qoi_desc *desc) { + FILE *f = fopen(filename, "wb"); + int size; + void *encoded; + + if (!f) { + return 0; + } + + encoded = qoi_encode(data, desc, &size); + if (!encoded) { + fclose(f); + return 0; + } + + fwrite(encoded, 1, size, f); + fclose(f); + + QOI_FREE(encoded); + return size; +} + +void *qoi_read(const char *filename, qoi_desc *desc, int channels) { + FILE *f = fopen(filename, "rb"); + int size, bytes_read; + void *pixels, *data; + + if (!f) { + return NULL; + } + + fseek(f, 0, SEEK_END); + size = ftell(f); + if (size <= 0) { + fclose(f); + return NULL; + } + fseek(f, 0, SEEK_SET); + + data = QOI_MALLOC(size); + if (!data) { + fclose(f); + return NULL; + } + + bytes_read = fread(data, 1, size, f); + 
fclose(f); + + pixels = qoi_decode(data, bytes_read, desc, channels); + QOI_FREE(data); + return pixels; +} + +#endif /* QOI_NO_STDIO */ +#endif /* QOI_IMPLEMENTATION */ diff --git a/ouster_osf/src/zpng.cpp b/ouster_osf/src/zpng.cpp new file mode 100644 index 00000000..3f97e120 --- /dev/null +++ b/ouster_osf/src/zpng.cpp @@ -0,0 +1,581 @@ +/** \file + \brief Zpng - Experimental Lossless Image Compressor + \copyright Copyright (c) 2018 Christopher A. Taylor. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Zpng nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include "zpng.h" + +#include + +#include // calloc + + +//------------------------------------------------------------------------------ +// Constants + +// Higher compression levels do not gain much but hurt speed +static const int kCompressionLevel = 1; + +// This enabled some specialized versions for RGB and RGBA +#define ENABLE_RGB_COLOR_FILTER + +// Header definitions +#define ZPNG_HEADER_MAGIC 0xFBF8 +#define ZPNG_HEADER_OVERHEAD_BYTES 8 + +// File format header +struct ZPNG_Header +{ + uint16_t Magic; + uint16_t Width; + uint16_t Height; + uint8_t Channels; + uint8_t BytesPerChannel; +}; + + +//------------------------------------------------------------------------------ +// Image Processing + +// Interleaving is a 1% compression win, and a 0.3% performance win: Not used. +// Splitting the data into blocks of 4 at a time actually reduces compression. + +template +static void PackAndFilter( + const ZPNG_ImageData* imageData, + uint8_t* output +) +{ + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + for (unsigned y = 0; y < height; ++y) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + // For each channel: + for (unsigned i = 0; i < kChannels; ++i) + { + uint8_t a = input[i]; + uint8_t d = a - prev[i]; + output[i] = d; + prev[i] = a; + } + + input += kChannels; + output += kChannels; + } + } +} + +template +static void UnpackAndUnfilter( + const uint8_t* input, + ZPNG_ImageData* imageData +) +{ + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + for (unsigned y = 0; y < height; ++y) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + // For each channel: + for (unsigned i = 0; i < kChannels; ++i) + { + uint8_t d = input[i]; + uint8_t a = d + prev[i]; + output[i] = a; + prev[i] = a; + 
} + + input += kChannels; + output += kChannels; + } + } +} + + +#ifdef ENABLE_RGB_COLOR_FILTER + +template<> +void PackAndFilter<3>( + const ZPNG_ImageData* imageData, + uint8_t* output + ) +{ + static const unsigned kChannels = 3; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + uint8_t* output_y = output; + uint8_t* output_u = output + planeBytes; + uint8_t* output_v = output + planeBytes * 2; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t r = input[0]; + uint8_t g = input[1]; + uint8_t b = input[2]; + + r -= prev[0]; + g -= prev[1]; + b -= prev[2]; + + prev[0] = input[0]; + prev[1] = input[1]; + prev[2] = input[2]; + + // GB-RG filter from BCIF + uint8_t y = b; + uint8_t u = g - b; + uint8_t v = g - r; + + *output_y++ = y; + *output_u++ = u; + *output_v++ = v; + + input += kChannels; + } + } +} + +template<> +void UnpackAndUnfilter<3>( + const uint8_t* input, + ZPNG_ImageData* imageData + ) +{ + static const unsigned kChannels = 3; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + const uint8_t* input_y = input; + const uint8_t* input_u = input + planeBytes; + const uint8_t* input_v = input + planeBytes * 2; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t y = *input_y++; + uint8_t u = *input_u++; + uint8_t v = *input_v++; + + // GB-RG filter from BCIF + const uint8_t B = y; + const uint8_t G = u + B; + uint8_t r = G - v; + uint8_t g = G; + uint8_t b = B; + + r += prev[0]; + g += prev[1]; + b += prev[2]; + + output[0] = r; + output[1] 
= g; + output[2] = b; + + prev[0] = r; + prev[1] = g; + prev[2] = b; + + output += kChannels; + } + } +} + + +// Version for RGBA (with alpha): + +template<> +void PackAndFilter<4>( + const ZPNG_ImageData* imageData, + uint8_t* output + ) +{ + static const unsigned kChannels = 4; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + const uint8_t* input = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + uint8_t* output_y = output; + uint8_t* output_u = output + planeBytes; + uint8_t* output_v = output + planeBytes * 2; + uint8_t* output_a = output + planeBytes * 3; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t r = input[0]; + uint8_t g = input[1]; + uint8_t b = input[2]; + uint8_t a = input[3]; + + r -= prev[0]; + g -= prev[1]; + b -= prev[2]; + a -= prev[3]; + + prev[0] = input[0]; + prev[1] = input[1]; + prev[2] = input[2]; + prev[3] = input[3]; + + // GB-RG filter from BCIF + uint8_t y = b; + uint8_t u = g - b; + uint8_t v = g - r; + + *output_y++ = y; + *output_u++ = u; + *output_v++ = v; + *output_a++ = a; + + input += kChannels; + } + } +} + +template<> +void UnpackAndUnfilter<4>( + const uint8_t* input, + ZPNG_ImageData* imageData + ) +{ + static const unsigned kChannels = 4; + + const unsigned height = imageData->HeightPixels; + const unsigned width = imageData->WidthPixels; + + uint8_t* output = imageData->Buffer.Data; + + // Color plane split + const unsigned planeBytes = width * height; + const uint8_t* input_y = input; + const uint8_t* input_u = input + planeBytes; + const uint8_t* input_v = input + planeBytes * 2; + const uint8_t* input_a = input + planeBytes * 3; + + for (unsigned row = 0; row < height; ++row) + { + uint8_t prev[kChannels] = { 0 }; + + for (unsigned x = 0; x < width; ++x) + { + uint8_t y = *input_y++; + uint8_t u = *input_u++; + uint8_t v = 
*input_v++; + uint8_t a = *input_a++; + + // GB-RG filter from BCIF + const uint8_t B = y; + const uint8_t G = u + B; + uint8_t r = G - v; + uint8_t g = G; + uint8_t b = B; + + r += prev[0]; + g += prev[1]; + b += prev[2]; + a += prev[3]; + + output[0] = r; + output[1] = g; + output[2] = b; + output[3] = a; + + prev[0] = r; + prev[1] = g; + prev[2] = b; + prev[3] = a; + + output += kChannels; + } + } +} + +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// API + +ZPNG_Buffer ZPNG_Compress( + const ZPNG_ImageData* imageData +) +{ + uint8_t* packing = nullptr; + uint8_t* output = nullptr; + + ZPNG_Buffer bufferOutput; + bufferOutput.Data = nullptr; + bufferOutput.Bytes = 0; + + const unsigned pixelCount = imageData->WidthPixels * imageData->HeightPixels; + const unsigned pixelBytes = imageData->BytesPerChannel * imageData->Channels; + const unsigned byteCount = pixelBytes * pixelCount; + + // FIXME: One day add support for other formats + if (pixelBytes > 8) { + return bufferOutput; + } + + // Space for packing + packing = (uint8_t*)calloc(1, byteCount); + + if (!packing) { +ReturnResult: + if (bufferOutput.Data != output) { + free(output); + } + free(packing); + return bufferOutput; + } + + const unsigned maxOutputBytes = (unsigned)ZSTD_compressBound(byteCount); + + // Space for output + output = (uint8_t*)calloc(1, ZPNG_HEADER_OVERHEAD_BYTES + maxOutputBytes); + + if (!output) { + goto ReturnResult; + } + + // Pass 1: Pack and filter data. 
+ + switch (pixelBytes) + { + case 1: + PackAndFilter<1>(imageData, packing); + break; + case 2: + PackAndFilter<2>(imageData, packing); + break; + case 3: + PackAndFilter<3>(imageData, packing); + break; + case 4: + PackAndFilter<4>(imageData, packing); + break; + case 5: + PackAndFilter<5>(imageData, packing); + break; + case 6: + PackAndFilter<6>(imageData, packing); + break; + case 7: + PackAndFilter<7>(imageData, packing); + break; + case 8: + PackAndFilter<8>(imageData, packing); + break; + } + + // Pass 2: Compress the packed/filtered data. + + const size_t result = ZSTD_compress( + output + ZPNG_HEADER_OVERHEAD_BYTES, + maxOutputBytes, + packing, + byteCount, + kCompressionLevel); + + if (ZSTD_isError(result)) { + goto ReturnResult; + } + + // Write header + + ZPNG_Header* header = (ZPNG_Header*)output; + header->Magic = ZPNG_HEADER_MAGIC; + header->Width = (uint16_t)imageData->WidthPixels; + header->Height = (uint16_t)imageData->HeightPixels; + header->Channels = (uint8_t)imageData->Channels; + header->BytesPerChannel = (uint8_t)imageData->BytesPerChannel; + + bufferOutput.Data = output; + bufferOutput.Bytes = ZPNG_HEADER_OVERHEAD_BYTES + (unsigned)result; + + goto ReturnResult; +} + +ZPNG_ImageData ZPNG_Decompress( + ZPNG_Buffer buffer +) +{ + uint8_t* packing = nullptr; + uint8_t* output = nullptr; + + ZPNG_ImageData imageData; + imageData.Buffer.Data = nullptr; + imageData.Buffer.Bytes = 0; + imageData.BytesPerChannel = 0; + imageData.Channels = 0; + imageData.HeightPixels = 0; + imageData.StrideBytes = 0; + imageData.WidthPixels = 0; + + if (!buffer.Data || buffer.Bytes < ZPNG_HEADER_OVERHEAD_BYTES) { +ReturnResult: + if (imageData.Buffer.Data != output) { + free(output); + } + free(packing); + return imageData; + } + + const ZPNG_Header* header = (const ZPNG_Header*)buffer.Data; + if (header->Magic != ZPNG_HEADER_MAGIC) { + goto ReturnResult; + } + + imageData.WidthPixels = header->Width; + imageData.HeightPixels = header->Height; + imageData.Channels 
= header->Channels; + imageData.BytesPerChannel = header->BytesPerChannel; + imageData.StrideBytes = imageData.WidthPixels * imageData.Channels; + + const unsigned pixelCount = imageData.WidthPixels * imageData.HeightPixels; + const unsigned pixelBytes = imageData.BytesPerChannel * imageData.Channels; + const unsigned byteCount = pixelBytes * pixelCount; + + // Space for packing + packing = (uint8_t*)calloc(1, byteCount); + + if (!packing) { + goto ReturnResult; + } + + // Stage 1: Decompress back to packing buffer + + const size_t result = ZSTD_decompress( + packing, + byteCount, + buffer.Data + ZPNG_HEADER_OVERHEAD_BYTES, + buffer.Bytes - ZPNG_HEADER_OVERHEAD_BYTES); + + if (ZSTD_isError(result)) { + goto ReturnResult; + } + + // Stage 2: Unpack/Unfilter + + // Space for output + output = (uint8_t*)calloc(1, byteCount); + + if (!output) { + goto ReturnResult; + } + + imageData.Buffer.Data = output; + imageData.Buffer.Bytes = byteCount; + + switch (pixelBytes) + { + case 1: + UnpackAndUnfilter<1>(packing, &imageData); + break; + case 2: + UnpackAndUnfilter<2>(packing, &imageData); + break; + case 3: + UnpackAndUnfilter<3>(packing, &imageData); + break; + case 4: + UnpackAndUnfilter<4>(packing, &imageData); + break; + case 5: + UnpackAndUnfilter<5>(packing, &imageData); + break; + case 6: + UnpackAndUnfilter<6>(packing, &imageData); + break; + case 7: + UnpackAndUnfilter<7>(packing, &imageData); + break; + case 8: + UnpackAndUnfilter<8>(packing, &imageData); + break; + } + + goto ReturnResult; +} + +void ZPNG_Free( + ZPNG_Buffer* buffer +) +{ + if (buffer && buffer->Data) + { + free(buffer->Data); + buffer->Data = nullptr; + buffer->Bytes = 0; + } +} + + +#ifdef __cplusplus +} +#endif diff --git a/ouster_osf/src/zpng.h b/ouster_osf/src/zpng.h new file mode 100644 index 00000000..a0d3a086 --- /dev/null +++ b/ouster_osf/src/zpng.h @@ -0,0 +1,141 @@ +/** \file + \brief Zpng - Experimental Lossless Image Compressor + \copyright Copyright (c) 2018 Christopher A. Taylor. 
All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Zpng nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CAT_ZPNG_H +#define CAT_ZPNG_H + +/** \page Zpng - Experimental Lossless Image Compressor + + It's much faster than PNG and compresses better for photographic images. + This compressor often takes less than 6% of the time of a PNG compressor + and produces a file that is 66% of the size. + It was written in about 600 lines of C code thanks to the Zstd library. + + This library is similar to PNG in that the image is first filtered, + and then submitted to a data compressor. 
+ The filtering step is a bit simpler and faster but somehow more effective + than the one used in PNG. + The data compressor used is Zstd, which makes it significantly faster + than PNG to compress and decompress. + + Filtering: + + (1) Reversible color channel transformation. + (2) Split each color channel into a separate color plane. + (3) Subtract each color value from the one to its left. +*/ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Buffer returned by library +struct ZPNG_Buffer +{ + // Pointer to data + unsigned char* Data; + + // Size of buffer in bytes + unsigned Bytes; +}; + +// Image data returned by the library +struct ZPNG_ImageData +{ + // Pixel data + ZPNG_Buffer Buffer; + + // Number of bytes for each color channel (1-2) + unsigned BytesPerChannel; + + // Number of channels for each pixel (1-4) + unsigned Channels; + + // Width in pixels of image + unsigned WidthPixels; + + // Height in pixels of image + unsigned HeightPixels; + + // Width of pixel row in bytes + unsigned StrideBytes; +}; + + +//------------------------------------------------------------------------------ +// API + +/** + ZPNG_Compress() + + Compress image into a buffer. + + The returned buffer should be passed to ZPNG_Free(). + + On success returns a valid data pointer. + On failure returns a null pointer. +*/ +ZPNG_Buffer ZPNG_Compress( + const ZPNG_ImageData* imageData +); + +/* + ZPNG_Decompress() + + Decompress image from a buffer. + + The returned ZPNG_Buffer should be passed to ZPNG_Free(). + + On success returns a valid data pointer. + On failure returns a null pointer. +*/ +ZPNG_ImageData ZPNG_Decompress( + ZPNG_Buffer buffer +); + +/* + ZPNG_Free() + + Free buffer when done to avoid leaks. + + This will also set the buffer data pointer to null. 
+*/ +void ZPNG_Free( + ZPNG_Buffer* buffer +); + + +#ifdef __cplusplus +} +#endif + + +#endif // CAT_ZPNG_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3098cbd9..084fd6ec 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -36,6 +36,7 @@ if(NOT SKIP_SDK_FIND) find_package(OusterSDK REQUIRED) endif() +add_compile_options(-march=native) # when building as a top-level project if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) message(STATUS "Ouster SDK client: Using EIGEN_MAX_ALIGN_BYTES = 32")