Skip to content

Commit 9966e9f

Browse files
committed
Add binary maxpool 2D op to LCE Micro (#53)
1 parent 48b9875 commit 9966e9f

11 files changed

+1196
-1030
lines changed

larq_compute_engine/micro/build_make/build_lcem.sh

+1
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ if [[ "$projects" == "1" || "$native" == "1" || "$arduino" == "1" || "$stm" == "
100100
bgemm_functor.h \
101101
cortexm/bconv2d_impl.h \
102102
cortexm/bgemv.h \
103+
bmaxpool.h \
103104
packbits.h \
104105
packbits_utils.h \
105106
types.h"

larq_compute_engine/micro/kernels/BUILD

+2
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ cc_library(
1212
name = "lce_op_kernels",
1313
srcs = [
1414
"bconv2d.cc",
15+
"bmaxpool.cc",
1516
],
1617
hdrs = [
1718
"micro_ops.h",
1819
],
1920
copts = micro_copts(),
2021
deps = [
2122
"//larq_compute_engine/core:bconv2d_impl_ref",
23+
"//larq_compute_engine/core:bmaxpool",
2224
"//larq_compute_engine/core:packbits_utils",
2325
"@org_tensorflow//tensorflow/lite/kernels:kernel_util",
2426
"@org_tensorflow//tensorflow/lite/kernels:padding",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
2+
#include "larq_compute_engine/core/bmaxpool.h"
3+
4+
#include "flatbuffers/flexbuffers.h" // TF:flatbuffers
5+
#include "larq_compute_engine/core/packbits_utils.h"
6+
#include "tensorflow/lite/c/builtin_op_data.h"
7+
#include "tensorflow/lite/c/common.h"
8+
#include "tensorflow/lite/kernels/internal/tensor.h"
9+
#include "tensorflow/lite/kernels/kernel_util.h"
10+
#include "tensorflow/lite/kernels/op_macros.h"
11+
12+
using namespace tflite;
13+
14+
namespace compute_engine {
15+
namespace tflite {
16+
namespace maxpool {
17+
18+
using namespace compute_engine::ref;
19+
using namespace compute_engine::core;
20+
21+
using TBitpacked = std::uint32_t;
22+
23+
// Per-node op data: the core binary-maxpool parameters plus the id of the
// scratch buffer (requested in Prepare) used to hold the bitpacked input
// during Eval when the input arrives as float32 or int8.
struct MicroBMaxPoolParams : public BMaxPoolParams {
  int packed_input_id;
};
26+
27+
// True iff the flexbuffer string attribute equals the C string literal.
// C-string comparison is used instead of constructing a `std::string`,
// which would dynamically allocate.
bool StringEquals(const flexbuffers::String& a, const char* b) {
  return !strcmp(a.c_str(), b);
}
31+
32+
// Parses the flexbuffer-encoded custom-op attributes into a persistently
// allocated MicroBMaxPoolParams.
//
// Returns the populated params, or nullptr on allocation failure or on an
// invalid `padding` attribute; in that case the framework stores nullptr as
// node->user_data and Prepare rejects the node.
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
  MicroBMaxPoolParams* poolparams = nullptr;
  if (context->AllocatePersistentBuffer(context, sizeof(MicroBMaxPoolParams),
                                        (void**)&poolparams) != kTfLiteOk) {
    context->ReportError(context, "Could not allocate persistent buffer.");
    return nullptr;
  }

  const std::uint8_t* buffer_t = reinterpret_cast<const std::uint8_t*>(buffer);
  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();

  poolparams->filter_height = m["filter_height"].AsInt32();
  poolparams->filter_width = m["filter_width"].AsInt32();
  poolparams->stride_height = m["stride_height"].AsInt32();
  poolparams->stride_width = m["stride_width"].AsInt32();

  // The padding attribute is a string; accept both upper- and lower-case.
  auto padding_str = m["padding"].AsString();
  if (StringEquals(padding_str, "VALID") ||
      StringEquals(padding_str, "valid")) {
    poolparams->padding_type = kTfLitePaddingValid;
  } else if (StringEquals(padding_str, "SAME") ||
             StringEquals(padding_str, "same")) {
    poolparams->padding_type = kTfLitePaddingSame;
  } else {
    context->ReportError(context, "Bmaxpool2d: invalid padding attribute.");
    // Bug fix: previously the params were returned here with `padding_type`
    // left uninitialized. Returning nullptr instead makes Prepare's
    // `poolparams != nullptr` check fail cleanly.
    return nullptr;
  }
  return poolparams;
}
61+
62+
// The only thing done in Prepare is asserts
63+
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
64+
MicroBMaxPoolParams* poolparams =
65+
reinterpret_cast<MicroBMaxPoolParams*>(node->user_data);
66+
67+
TF_LITE_ENSURE(context, poolparams != nullptr);
68+
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
69+
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
70+
TfLiteTensor* output = GetOutput(context, node, 0);
71+
const TfLiteTensor* input = GetInput(context, node, 0);
72+
TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
73+
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt32);
74+
75+
int batches = input->dims->data[0];
76+
int height = input->dims->data[1];
77+
int width = input->dims->data[2];
78+
79+
int out_width, out_height;
80+
poolparams->padding = ComputePaddingHeightWidth(
81+
poolparams->stride_height, poolparams->stride_width, 1, 1, height, width,
82+
poolparams->filter_height, poolparams->filter_width,
83+
poolparams->padding_type, &out_height, &out_width);
84+
85+
int channels_out = 0;
86+
if (input->type == kTfLiteFloat32 || input->type == kTfLiteInt8) {
87+
channels_out = GetPackedSize<TBitpacked>(input->dims->data[3]);
88+
} else {
89+
TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt32);
90+
channels_out = input->dims->data[3];
91+
}
92+
93+
// Use temoprary tensor for bitpacked inputs
94+
if (input->type == kTfLiteFloat32 || input->type == kTfLiteInt8) {
95+
int flat_size =
96+
batches * height * width * channels_out * sizeof(TBitpacked);
97+
98+
TF_LITE_ENSURE_OK(context,
99+
context->RequestScratchBufferInArena(
100+
context, flat_size, &poolparams->packed_input_id));
101+
}
102+
103+
return kTfLiteOk;
104+
}
105+
106+
// Runs binary max-pooling. Float32/int8 inputs are first bitpacked into the
// scratch buffer requested in Prepare; int32 inputs are treated as already
// bitpacked and pooled directly.
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  MicroBMaxPoolParams* poolparams =
      reinterpret_cast<MicroBMaxPoolParams*>(node->user_data);

  TF_LITE_ENSURE(context, poolparams != nullptr);

  TfLiteTensor* output = GetOutput(context, node, 0);
  const TfLiteTensor* input = GetInput(context, node, 0);

  const TBitpacked* packed_input_data;
  RuntimeShape packed_input_shape;

  if (input->type == kTfLiteInt32) {
    // Input is already bitpacked: pool it in place.
    packed_input_shape.ReplaceWith(4, GetTensorShape(input).DimsData());
    packed_input_data = GetTensorData<TBitpacked>(input);
  } else {
    // Bitpack the input into the scratch buffer before pooling. The fetch of
    // the scratch buffer is shared between the float32 and int8 paths.
    TBitpacked* packed_input = reinterpret_cast<TBitpacked*>(
        context->GetScratchBuffer(context, poolparams->packed_input_id));
    if (input->type == kTfLiteFloat32) {
      ce::core::packbits_tensor<ce::core::BitpackOrder::Canonical>(
          GetTensorShape(input), GetTensorData<float>(input), 0,
          packed_input_shape, packed_input);
    } else {
      TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8);
      ce::core::packbits_tensor<ce::core::BitpackOrder::Canonical>(
          GetTensorShape(input), GetTensorData<std::int8_t>(input),
          input->params.zero_point, packed_input_shape, packed_input);
    }
    packed_input_data = packed_input;
  }

  // MicroBMaxPoolParams derives from BMaxPoolParams, so the derived object
  // binds directly to the base-class parameter.
  BMaxPool(*poolparams, packed_input_shape, packed_input_data,
           GetTensorShape(output), GetTensorData<TBitpacked>(output));

  return kTfLiteOk;
}
144+
145+
} // namespace maxpool
146+
147+
// Registration entry point for the LceBMaxPool2d custom op.
TfLiteRegistration* Register_BMAXPOOL_2D() {
  static TfLiteRegistration registration = {/*init=*/maxpool::Init,
                                            /*free=*/nullptr,
                                            /*prepare=*/maxpool::Prepare,
                                            /*invoke=*/maxpool::Eval};
  return &registration;
}
152+
153+
} // namespace tflite
154+
} // namespace compute_engine

larq_compute_engine/micro/kernels/micro_ops.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ namespace tflite {
88

99
TfLiteRegistration* Register_BCONV_2D();
1010
TfLiteRegistration* Register_BCONV_2D_NoFloat();
11+
TfLiteRegistration* Register_BMAXPOOL_2D();
1112

1213
} // namespace tflite
1314
} // namespace compute_engine

larq_compute_engine/micro/tests/end2end_test.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,31 @@ def quant(x):
3333
def toy_model_int8(**kwargs):
3434
img = tf.keras.layers.Input(shape=(16, 16, 3))
3535
x = quant(img)
36+
x = tf.keras.layers.MaxPooling2D((2, 2))(x) # Binary maxpool
3637
x = lq.layers.QuantConv2D(
37-
32, 3, input_quantizer="ste_sign", kernel_quantizer="ste_sign", activation=quant
38+
32,
39+
3,
40+
input_quantizer="ste_sign",
41+
kernel_quantizer="ste_sign",
42+
padding="same",
43+
pad_values=1.0,
3844
)(x)
45+
x = tf.keras.layers.MaxPooling2D((2, 2))(x) # Binary maxpool
3946
x = lq.layers.QuantConv2D(
40-
64, 3, input_quantizer="ste_sign", kernel_quantizer="ste_sign", activation=quant
47+
64,
48+
3,
49+
input_quantizer="ste_sign",
50+
kernel_quantizer="ste_sign",
51+
padding="same",
52+
pad_values=1.0,
4153
)(x)
4254
x = lq.layers.QuantConv2D(
43-
32, 3, input_quantizer="ste_sign", kernel_quantizer="ste_sign", activation=quant
55+
32,
56+
3,
57+
input_quantizer="ste_sign",
58+
kernel_quantizer="ste_sign",
59+
padding="same",
60+
pad_values=1.0,
4461
)(x)
4562
x = global_pool(x)
4663
x = lq.layers.QuantDense(

larq_compute_engine/micro/tests/lce_test/lce_test.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) {
4848
model->version(), TFLITE_SCHEMA_VERSION);
4949
}
5050

51-
tflite::MicroOpResolver<9> resolver;
51+
tflite::MicroOpResolver<10> resolver;
5252
resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D,
5353
tflite::ops::micro::Register_CONV_2D(), 3, 3);
5454
resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
@@ -67,6 +67,8 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) {
6767
tflite::ops::micro::Register_DEQUANTIZE(), 2, 2);
6868
resolver.AddCustom("LceBconv2d",
6969
compute_engine::tflite::Register_BCONV_2D_NoFloat());
70+
resolver.AddCustom("LceBMaxPool2d",
71+
compute_engine::tflite::Register_BMAXPOOL_2D());
7072

7173
// Create an area of memory to use for input, output, and intermediate arrays.
7274
// Finding the minimum value for your model may require some trial and error.

0 commit comments

Comments
 (0)