diff --git a/CMakeLists.txt b/CMakeLists.txt index 49d8ee810..fc0ec07c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,8 +29,8 @@ include(cmake/ConfigGen.cmake) caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA caffe_option(USE_OPENMP "Build Caffe with OpenMP support" ON ) caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY) -caffe_option(USE_MKL2017_AS_DEFAULT_ENGINE "Use MKL2017 primitives for supported layers" ON) -caffe_option(USE_MKLDNN_AS_DEFAULT_ENGINE "Use MKL-DNN primitives for supported layers" OFF) +caffe_option(USE_MKL2017_AS_DEFAULT_ENGINE "Use MKL2017 primitives for supported layers" OFF) +caffe_option(USE_MKLDNN_AS_DEFAULT_ENGINE "Use MKL-DNN primitives for supported layers" ON) caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON) caffe_option(BUILD_python "Build Python wrapper" ON) set(python_version "2" CACHE STRING "Specify which Python version to use") @@ -68,6 +68,8 @@ if(UNIX OR APPLE) # GCC specific flags. if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIE -fstack-protector-strong") + # Automatically enable SGD fusion when the GCC version is >= 4.9 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_SGD_FUSION") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIE -fstack-protector") endif() @@ -79,6 +81,8 @@ if(UNIX OR APPLE) # though it uses -pie linker option that require -fPIE during compilation. Checksec # shows that it generates correct PIE anyway if only -pie is provided. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector") + # Automatically enable SGD fusion when using the Intel compiler + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_SGD_FUSION") endif() # Generic flags. diff --git a/Makefile b/Makefile index f7144b7db..0df6c3c4e 100644 --- a/Makefile +++ b/Makefile @@ -103,6 +103,7 @@ DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT) COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) +COMMON_FLAGS += -std=c++11 ############################## # Get all source files @@ -318,6 +319,11 @@ else ifeq ($(UNAME), Darwin) OSX_MINOR_VERSION := $(shell sw_vers -productVersion | cut -f 2 -d .)
endif +# Custom compiler +ifdef CUSTOM_CXX + CXX := $(CUSTOM_CXX) +endif + # Linux ifeq ($(LINUX), 1) CXX ?= /usr/bin/g++ @@ -365,19 +371,19 @@ else ORIGIN := \$$ORIGIN endif -# Custom compiler -ifdef CUSTOM_CXX - CXX := $(CUSTOM_CXX) -endif - # Compiler flags ifneq (,$(findstring icpc,$(CXX))) CXX_HARDENING_FLAGS += -fstack-protector + # Enable SGD fusion when using the Intel compiler + COMMON_FLAGS += -DENABLE_SGD_FUSION + else ifneq (,$(findstring clang++,$(CXX))) CXX_HARDENING_FLAGS += -fPIE -fstack-protector else ifneq (,$(findstring g++,$(CXX))) - ifeq ($(shell echo | awk '{exit $(GCCVERSION) >= 4.9;}'), 1) + ifeq ($(shell echo | awk '{ print $(GCCVERSION) >= 4.9 }'), 1) CXX_HARDENING_FLAGS += -fPIE -fstack-protector-strong + # Enable SGD fusion when the GCC version is >= 4.9 + COMMON_FLAGS += -DENABLE_SGD_FUSION else CXX_HARDENING_FLAGS += -fPIE -fstack-protector endif @@ -500,7 +506,7 @@ ifeq ($(MKL_EXTERNAL), 1) MKL_LDFLAGS+=-Wl,-rpath,$(MKLROOT)/lib endif - COMMON_FLAGS += -DUSE_MKL + COMMON_FLAGS += -DUSE_MKL -DMKL_ILP64 BLAS_INCLUDE ?= $(MKLROOT)/include BLAS_LIB ?= $(MKLROOT)/lib $(MKLROOT)/lib/intel64 @@ -547,10 +553,6 @@ LIBRARY_DIRS += $(LIB_BUILD_DIR) # Automatic dependency generation (nvcc is handled separately) CXXFLAGS += -MMD -MP -##########SGD FUSION####################### -ifeq ($(ENABLE_SGD_FUSION), 1) - COMMON_FLAGS += -DENABLE_SGD_FUSION -endif ########################################### # # Complete build flags. @@ -827,6 +829,7 @@ $(PY_PROTO_INIT): | $(PY_PROTO_BUILD_DIR) touch $(PY_PROTO_INIT) clean: mkldnn_clean + @echo "After cleaning, new versions of MKL2017 and MLSL will be downloaded to prepare the environment." @- $(RM) -rf $(ALL_BUILD_DIRS) @- $(RM) -rf $(OTHER_BUILD_DIR) @- $(RM) -rf $(BUILD_DIR_LINK) diff --git a/Makefile.config.example b/Makefile.config.example index 539a00a67..d368722be 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -43,13 +43,13 @@ # CPU-only switch (uncomment to build without GPU support). CPU_ONLY := 1 -USE_MKL2017_AS_DEFAULT_ENGINE := 1 +# USE_MKL2017_AS_DEFAULT_ENGINE := 1 # or put this at the top your train_val.protoxt or solver.prototxt file: # engine: "MKL2017" # or use this option with caffe tool: # -engine "MKL2017" -# USE_MKLDNN_AS_DEFAULT_ENGINE flag is OBSOLETE +USE_MKLDNN_AS_DEFAULT_ENGINE := 1 # Put this at the top your train_val.protoxt or solver.prototxt file: # engine: "MKLDNN" # or use this option with caffe tool: # -engine "MKLDNN" @@ -170,8 +170,5 @@ DISTRIBUTE_DIR := distribute # The ID of the GPU that 'make runtest' will use to run unit tests. TEST_GPUID := 0 -# Uncomment for enabling SGD fusion -# ENABLE_SGD_FUSION := 1 - # enable pretty build (comment to see full commands) Q ?= @ diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 286a42802..143a8f80c 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -145,7 +145,7 @@ macro(caffe_cuda_compile objlist_variable) endforeach() if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) + list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC -std=c++11) endif() if(APPLE) diff --git a/docker/README.md b/docker/README.md index 3b8ae1a54..adb893282 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,49 +1,49 @@ -# Building Caffe using standalone Dockerfile - -The `standalone` subfolder contains docker files for generating both CPU and GPU executable images for Caffe. The images can be built using make, or by running: - -``` -docker build -t caffe:cpu standalone/cpu-ubuntu -``` -for example.
(Here `ubuntu` can be substituted for `centos`, `gpu` can be substituted for `cpu`, but to keep the readme simple, only the `cpu` case will be discussed in detail). - -Note that the GPU standalone requires a CUDA 7.5 capable driver to be installed on the system and [nvidia-docker] for running the Docker containers. Here it is generally sufficient to use `nvidia-docker` instead of `docker` in any of the commands mentioned. - -# Running Caffe using the docker image - -In order to test the Caffe image, run: -``` -docker run -ti caffe:cpu caffe --version -``` -which should show a message like: -``` -caffe version 1.0.0-rc3 -``` - -One can also build and run the Caffe tests in the image using: -``` -docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest" -``` - -In order to get the most out of the caffe image, some more advanced `docker run` options could be used. For example, running: -``` -docker run -ti caffe:cpu caffe time -model /opt/caffe/models/bvlc_alexnet/deploy.prototxt -engine MKLDNN -``` -will measure the performance of AlexNet. You can also run caffe train as well. Note that docker runs all commands as root by default, and thus any output files (e.g. snapshots) generated will be owned by the root user. In order to ensure that the current user is used instead, the following command can be used: -``` -docker run -ti --volume=$(pwd):/workspace -u $(id -u):$(id -g) caffe:cpu caffe train --solver=/opt/caffe/models/bvlc_alexnet/solver.prototxt -engine MKLDNN -``` -where the `-u` Docker command line option runs the commands in the container as the specified user, and the shell command `id` is used to determine the user and group ID of the current user. Note that the Caffe docker images have `/workspace` defined as the default working directory. This can be overridden using the `--workdir=` Docker command line option. Note that you need to prepare dataset before training. - -# Other use-cases - -Although running the `caffe` command in the docker containers as described above serves many purposes, the container can also be used for more interactive use cases. For example, specifying `bash` as the command instead of `caffe` yields a shell that can be used for interactive tasks. (Since the caffe build requirements are included in the container, this can also be used to build and run local versions of caffe). - -Another use case is to run python scripts that depend on `caffe`'s Python modules. Using the `python` command instead of `bash` or `caffe` will allow this, and an interactive interpreter can be started by running: -``` -docker run -ti caffe:cpu python -``` -(`ipython` is also available in the container). - -Since the `caffe/python` folder is also added to the path, the utility executable scripts defined there can also be used as executables. This includes `draw_net.py`, `classify.py`, and `detect.py` - +# Building Caffe using standalone Dockerfile + +The `standalone` subfolder contains docker files for generating both CPU and GPU executable images for Caffe. The images can be built using make, or by running: + +``` +docker build -t caffe:cpu standalone/cpu-ubuntu +``` +for example. (Here `centos` can be substituted for `ubuntu`, `gpu` can be substituted for `cpu`, but to keep the readme simple, only the `cpu` case will be discussed in detail). + +Note that the GPU standalone requires a CUDA 7.5 capable driver to be installed on the system and [nvidia-docker] for running the Docker containers. Here it is generally sufficient to use `nvidia-docker` instead of `docker` in any of the commands mentioned.
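+For example, a GPU image built from `standalone/gpu` can be smoke-tested with the following (a sketch; the `caffe:gpu` tag is an assumption, mirroring the `cpu` example above): +``` +nvidia-docker run -ti caffe:gpu caffe device_query -gpu 0 +```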
+ +# Running Caffe using the docker image + +In order to test the Caffe image, run: +``` +docker run -ti caffe:cpu caffe --version +``` +which should show a message like: +``` +caffe version 1.0.0-rc3 +``` + +One can also build and run the Caffe tests in the image using: +``` +docker run -ti caffe:cpu bash -c "cd /opt/caffe/build; make runtest" +``` + +In order to get the most out of the caffe image, some more advanced `docker run` options could be used. For example, running: +``` +docker run -ti caffe:cpu caffe time -model /opt/caffe/models/bvlc_alexnet/deploy.prototxt -engine MKLDNN +``` +will measure the performance of AlexNet. You can run `caffe train` as well. Note that docker runs all commands as root by default, and thus any output files (e.g. snapshots) generated will be owned by the root user. In order to ensure that the current user is used instead, the following command can be used: +``` +docker run -ti --volume=$(pwd):/workspace -u $(id -u):$(id -g) caffe:cpu caffe train --solver=/opt/caffe/models/bvlc_alexnet/solver.prototxt -engine MKLDNN +``` +where the `-u` Docker command line option runs the commands in the container as the specified user, and the shell command `id` is used to determine the user and group ID of the current user. Note that the Caffe docker images have `/workspace` defined as the default working directory. This can be overridden using the `--workdir=` Docker command line option. Note that you need to prepare the dataset before training. + +# Other use-cases + +Although running the `caffe` command in the docker containers as described above serves many purposes, the container can also be used for more interactive use cases. For example, specifying `bash` as the command instead of `caffe` yields a shell that can be used for interactive tasks. (Since the caffe build requirements are included in the container, this can also be used to build and run local versions of caffe). + +Another use case is to run python scripts that depend on `caffe`'s Python modules. Using the `python` command instead of `bash` or `caffe` will allow this, and an interactive interpreter can be started by running: +``` +docker run -ti caffe:cpu python +``` +(`ipython` is also available in the container). + +Since the `caffe/python` folder is also added to the path, the utility executable scripts defined there can also be used as executables. This includes `draw_net.py`, `classify.py`, and `detect.py`.
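+ +As a quick sanity check of the Python interface (a minimal sketch; `caffe.set_mode_cpu()` is a standard pycaffe call and the printed string is arbitrary): +``` +docker run -ti caffe:cpu python -c "import caffe; caffe.set_mode_cpu(); print('pycaffe OK')" +```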
+ diff --git a/docker/standalone/cpu-centos/Dockerfile b/docker/standalone/cpu-centos/Dockerfile index f3ed5e44c..dfa313887 100644 --- a/docker/standalone/cpu-centos/Dockerfile +++ b/docker/standalone/cpu-centos/Dockerfile @@ -1,56 +1,56 @@ -FROM centos:7 -MAINTAINER caffe-maint@googlegroups.com - -#ENV http_proxy proxy:port -#ENV https_proxy proxy:port - -RUN rpm -iUvh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-8.noarch.rpm - -RUN yum install -y \ - redhat-rpm-config \ - tar \ - findutils \ - make \ - gcc-c++ \ - cmake \ - git \ - wget \ - atlas-devel \ - boost-devel \ - gflags-devel \ - glog-devel \ - hdf5-devel \ - leveldb-devel \ - lmdb-devel \ - opencv-devel \ - protobuf-devel \ - snappy-devel \ - protobuf-compiler \ - freetype-devel \ - libpng-devel \ - python-devel \ - python-numpy \ - python-pip \ - python-scipy \ - gcc-gfortran \ - libjpeg-turbo-devel - -RUN yum clean all -ENV CAFFE_ROOT=/opt/caffe -WORKDIR $CAFFE_ROOT - -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master - -RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/intel/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip --no-cache-dir install $req; done && \ - mkdir build && cd build && \ - cmake -DCPU_ONLY=1 -DCMAKE_BUILD_TYPE=Release .. && \ - make all -j"$(nproc)" - -ENV PYCAFFE_ROOT $CAFFE_ROOT/python -ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH -ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH -RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig - -WORKDIR /workspace +FROM centos:7 +MAINTAINER caffe-maint@googlegroups.com + +#ENV http_proxy proxy:port +#ENV https_proxy proxy:port + +RUN rpm -iUvh http://download.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-8.noarch.rpm + +RUN yum install -y \ + redhat-rpm-config \ + tar \ + findutils \ + make \ + gcc-c++ \ + cmake \ + git \ + wget \ + atlas-devel \ + boost-devel \ + gflags-devel \ + glog-devel \ + hdf5-devel \ + leveldb-devel \ + lmdb-devel \ + opencv-devel \ + protobuf-devel \ + snappy-devel \ + protobuf-compiler \ + freetype-devel \ + libpng-devel \ + python-devel \ + python-numpy \ + python-pip \ + python-scipy \ + gcc-gfortran \ + libjpeg-turbo-devel + +RUN yum clean all +ENV CAFFE_ROOT=/opt/caffe +WORKDIR $CAFFE_ROOT + +# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. +ENV CLONE_TAG=master + +RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/intel/caffe.git . && \ + for req in $(cat python/requirements.txt) pydot; do pip --no-cache-dir install $req; done && \ + mkdir build && cd build && \ + cmake -DCPU_ONLY=1 -DCMAKE_BUILD_TYPE=Release ..
&& \ + make all -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /workspace diff --git a/docker/standalone/cpu-ubuntu/Dockerfile b/docker/standalone/cpu-ubuntu/Dockerfile index 004c7e7c3..cfbb682cb 100644 --- a/docker/standalone/cpu-ubuntu/Dockerfile +++ b/docker/standalone/cpu-ubuntu/Dockerfile @@ -1,45 +1,45 @@ -FROM ubuntu:14.04 -MAINTAINER caffe-maint@googlegroups.com - -#ENV http_proxy proxy:port -#ENV https_proxy proxy:port - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - libboost-all-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libhdf5-serial-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libprotobuf-dev \ - libsnappy-dev \ - protobuf-compiler \ - python-dev \ - python-numpy \ - python-pip \ - python-scipy && \ - rm -rf /var/lib/apt/lists/* - -ENV CAFFE_ROOT=/opt/caffe -WORKDIR $CAFFE_ROOT - -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master - -RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/intel/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ - mkdir build && cd build && \ - cmake -DCPU_ONLY=1 -DCMAKE_BUILD_TYPE=Release .. && \ - make all -j"$(nproc)" - -ENV PYCAFFE_ROOT $CAFFE_ROOT/python -ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH -ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH -RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig - -WORKDIR /workspace +FROM ubuntu:14.04 +MAINTAINER caffe-maint@googlegroups.com + +#ENV http_proxy proxy:port +#ENV https_proxy proxy:port + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + wget \ + libboost-all-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libhdf5-serial-dev \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libprotobuf-dev \ + libsnappy-dev \ + protobuf-compiler \ + python-dev \ + python-numpy \ + python-pip \ + python-scipy && \ + rm -rf /var/lib/apt/lists/* + +ENV CAFFE_ROOT=/opt/caffe +WORKDIR $CAFFE_ROOT + +# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. +ENV CLONE_TAG=master + +RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/intel/caffe.git . && \ + for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + mkdir build && cd build && \ + cmake -DCPU_ONLY=1 -DCMAKE_BUILD_TYPE=Release .. 
&& \ + make all -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /workspace diff --git a/docker/standalone/gpu/Dockerfile b/docker/standalone/gpu/Dockerfile index 932664599..d69ebea90 100644 --- a/docker/standalone/gpu/Dockerfile +++ b/docker/standalone/gpu/Dockerfile @@ -1,3 +1,4 @@ +<<<<<<< HEAD FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER caffe-maint@googlegroups.com @@ -44,3 +45,51 @@ ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig WORKDIR /workspace +======= +FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 +MAINTAINER caffe-maint@googlegroups.com + +#ENV http_proxy proxy:port +#ENV https_proxy proxy:port + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + wget \ + libatlas-base-dev \ + libboost-all-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libhdf5-serial-dev \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libprotobuf-dev \ + libsnappy-dev \ + protobuf-compiler \ + python-dev \ + python-numpy \ + python-pip \ + python-scipy && \ + rm -rf /var/lib/apt/lists/* + +ENV CAFFE_ROOT=/opt/caffe +WORKDIR $CAFFE_ROOT + +# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. +ENV CLONE_TAG=master + +RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/intel/caffe.git . && \ + for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + mkdir build && cd build && \ + cmake -DUSE_CUDNN=1 .. && \ + make -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /workspace +>>>>>>> remotes/internal/release_1.0.5 diff --git a/docker/templates/Dockerfile.template b/docker/templates/Dockerfile.template index de3c91beb..94710e4e6 100644 --- a/docker/templates/Dockerfile.template +++ b/docker/templates/Dockerfile.template @@ -1,45 +1,45 @@ -MAINTAINER caffe-maint@googlegroups.com - -#ENV http_proxy proxy:port -#ENV https_proxy proxy:port - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - libatlas-base-dev \ - libboost-all-dev \ - libgflags-dev \ - libgoogle-glog-dev \ - libhdf5-serial-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libprotobuf-dev \ - libsnappy-dev \ - protobuf-compiler \ - python-dev \ - python-numpy \ - python-pip \ - python-scipy && \ - rm -rf /var/lib/apt/lists/* - -ENV CAFFE_ROOT=/opt/caffe -WORKDIR $CAFFE_ROOT - -# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. -ENV CLONE_TAG=master - -RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ - for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ - mkdir build && cd build && \ - cmake ${CMAKE_ARGS} .. 
&& \ - make -j"$(nproc)" - -ENV PYCAFFE_ROOT $CAFFE_ROOT/python -ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH -ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH -RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig - -WORKDIR /workspace +MAINTAINER caffe-maint@googlegroups.com + +#ENV http_proxy proxy:port +#ENV https_proxy proxy:port + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + wget \ + libatlas-base-dev \ + libboost-all-dev \ + libgflags-dev \ + libgoogle-glog-dev \ + libhdf5-serial-dev \ + libleveldb-dev \ + liblmdb-dev \ + libopencv-dev \ + libprotobuf-dev \ + libsnappy-dev \ + protobuf-compiler \ + python-dev \ + python-numpy \ + python-pip \ + python-scipy && \ + rm -rf /var/lib/apt/lists/* + +ENV CAFFE_ROOT=/opt/caffe +WORKDIR $CAFFE_ROOT + +# FIXME: clone a specific git tag and use ARG instead of ENV once DockerHub supports this. +ENV CLONE_TAG=master + +RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ + for req in $(cat python/requirements.txt) pydot; do pip install $req; done && \ + mkdir build && cd build && \ + cmake ${CMAKE_ARGS} .. && \ + make -j"$(nproc)" + +ENV PYCAFFE_ROOT $CAFFE_ROOT/python +ENV PYTHONPATH $PYCAFFE_ROOT:$PYTHONPATH +ENV PATH $CAFFE_ROOT/build/tools:$PYCAFFE_ROOT:$PATH +RUN echo "$CAFFE_ROOT/build/lib" >> /etc/ld.so.conf.d/caffe.conf && ldconfig + +WORKDIR /workspace diff --git a/examples/faster-rcnn/README.md b/examples/faster-rcnn/README.md new file mode 100755 index 000000000..2aa94a744 --- /dev/null +++ b/examples/faster-rcnn/README.md @@ -0,0 +1,194 @@ +### Disclaimer + +This is the CPU only version of Faster-RCNN. + +The official Faster R-CNN code (written in MATLAB) is available [here](https://github.com/ShaoqingRen/faster_rcnn). +If your goal is to reproduce the results in our NIPS 2015 paper, please use the [official code](https://github.com/ShaoqingRen/faster_rcnn). + +This repository contains a Python *reimplementation* of the MATLAB code. +This Python implementation is built on a fork of [Fast R-CNN](https://github.com/rbgirshick/fast-rcnn). +There are slight differences between the two implementations. +In particular, this Python port + - is ~10% slower at test-time, because some operations execute on the CPU in Python layers (e.g., 220ms / image vs. 200ms / image for VGG16) + - gives similar, but not exactly the same, mAP as the MATLAB version + - is *not compatible* with models trained using the MATLAB code due to the minor implementation differences + - **includes approximate joint training** that is 1.5x faster than alternating optimization (for VGG16) -- see these [slides](https://www.dropbox.com/s/xtr4yd4i5e0vw8g/iccv15_tutorial_training_rbg.pdf?dl=0) for more information + +# *Faster* R-CNN: Towards Real-Time Object Detection with Region Proposal Networks + +By Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun (Microsoft Research) + +This Python implementation contains contributions from Sean Bell (Cornell) written during an MSR internship. + +Please see the official [README.md](https://github.com/ShaoqingRen/faster_rcnn/blob/master/README.md) for more details. + +Faster R-CNN was initially described in an [arXiv tech report](http://arxiv.org/abs/1506.01497) and was subsequently published in NIPS 2015. + +### License + +Faster R-CNN is released under the MIT License (refer to the LICENSE file for details). 
+ +### Citing Faster R-CNN + +If you find Faster R-CNN useful in your research, please consider citing: + + @inproceedings{renNIPS15fasterrcnn, + Author = {Shaoqing Ren and Kaiming He and Ross Girshick and Jian Sun}, + Title = {Faster {R-CNN}: Towards Real-Time Object Detection + with Region Proposal Networks}, + Booktitle = {Advances in Neural Information Processing Systems ({NIPS})}, + Year = {2015} + } + +### Contents +1. [Requirements: software](#requirements-software) +2. [Requirements: hardware](#requirements-hardware) +3. [Basic installation](#installation-sufficient-for-the-demo) +4. [Demo](#demo) +5. [Beyond the demo: training and testing](#beyond-the-demo-installation-for-training-and-testing-models) +6. [Usage](#usage) + +### Requirements: software + +1. Python packages you might have: `cython`, `python-opencv`, `easydict` +2. [Optional] MATLAB is required for **official** PASCAL VOC evaluation only. The code now includes unofficial Python evaluation code. + +### Requirements: hardware + +1. For training smaller networks (ZF, VGG_CNN_M_1024) a good GPU (e.g., Titan, K20, K40, ...) with at least 3G of memory suffices +2. For training Fast R-CNN with VGG16, you'll need a K40 (~11G of memory) +3. For training the end-to-end version of Faster R-CNN with VGG16, 3G of GPU memory is sufficient (using CUDNN) + +Note that these GPU requirements are inherited from the original implementation; as the disclaimer above explains, this port is CPU-only. + +### Installation (sufficient for the demo) + +1. Clone the latest Intel Caffe repository + ```Shell + git clone --recursive https://github.com/intel/caffe.git + ``` + +2. Build the Cython modules + ```Shell + cd $CAFFE_ROOT/examples/faster-rcnn/lib + make + ``` + +3. Build Caffe and pycaffe + ```Shell + cd $CAFFE_ROOT/ + mkdir build + cd build + cmake .. -DCPU_ONLY=ON + make -j + ``` + +4. Download pre-computed Faster R-CNN detectors + ```Shell + cd $CAFFE_ROOT/examples/faster-rcnn/ + ./data/scripts/fetch_faster_rcnn_models.sh + ``` + + This will populate the `$CAFFE_ROOT/examples/faster-rcnn/data` folder with `faster_rcnn_models`. See `data/README.md` for details. + These models were trained on VOC 2007 trainval. + +### Demo + +*After successfully completing [basic installation](#installation-sufficient-for-the-demo)*, you'll be ready to run the demo. + +To run the demo: +```Shell +cd $CAFFE_ROOT/examples/faster-rcnn/ +./tools/demo.py +``` +The demo performs detection using a VGG16 network trained for detection on PASCAL VOC 2007. + +### Beyond the demo: installation for training and testing models +1. Download the training, validation, test data and VOCdevkit + + ```Shell + wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar + wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar + wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar + ``` + +2. Extract all of these tars into one directory named `VOCdevkit` + + ```Shell + tar xvf VOCtrainval_06-Nov-2007.tar + tar xvf VOCtest_06-Nov-2007.tar + tar xvf VOCdevkit_08-Jun-2007.tar + ``` + +3. It should have this basic structure + + ```Shell + $VOCdevkit/ # development kit + $VOCdevkit/VOCcode/ # VOC utility code + $VOCdevkit/VOC2007 # image sets, annotations, etc. + # ... and several other directories ... + ``` + +4. Create symlinks for the PASCAL VOC dataset + + ```Shell + cd $CAFFE_ROOT/examples/faster-rcnn/data + ln -s $VOCdevkit VOCdevkit2007 + ``` + Using symlinks is a good idea because you will likely want to share the same PASCAL dataset installation between multiple projects. +5. [Optional] follow similar steps to get PASCAL VOC 2010 and 2012 +6.
[Optional] If you want to use COCO, please see some notes under `data/README.md` +7. Follow the next sections to download pre-trained ImageNet models + +### Download pre-trained ImageNet models + +Pre-trained ImageNet models can be downloaded for two of the three networks described in the paper: ZF and VGG16. + +```Shell +cd $CAFFE_ROOT/examples/faster-rcnn/ +./data/scripts/fetch_imagenet_models.sh +``` +VGG16 comes from the [Caffe Model Zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo), but is provided here for your convenience. +ZF was trained at MSRA. + +### Usage + +To train and test a Faster R-CNN detector using the **alternating optimization** algorithm from our NIPS 2015 paper, use `experiments/scripts/faster_rcnn_alt_opt.sh`. +Output is written underneath `$CAFFE_ROOT/examples/faster-rcnn/output`. + +```Shell +cd $CAFFE_ROOT/examples/faster-rcnn/ +./experiments/scripts/faster_rcnn_alt_opt.sh [NET] [--set ...] +# we only support training on the CPU. +# NET in {ZF, VGG_CNN_M_1024, VGG16} is the network arch to use +# --set ... allows you to specify fast_rcnn.config options, e.g. +# --set EXP_DIR seed_rng1701 RNG_SEED 1701 +``` + +("alt opt" refers to the alternating optimization training algorithm described in the NIPS paper.) + +To train and test a Faster R-CNN detector using the **approximate joint training** method, use `experiments/scripts/faster_rcnn_end2end.sh`. +Output is written underneath `$CAFFE_ROOT/examples/faster-rcnn/output`. + +```Shell +cd $CAFFE_ROOT/examples/faster-rcnn/ +./experiments/scripts/faster_rcnn_end2end.sh [NET] [--set ...] +# we only support training on the CPU. +# NET in {ZF, VGG_CNN_M_1024, VGG16} is the network arch to use +# --set ... allows you to specify fast_rcnn.config options, e.g. +# --set EXP_DIR seed_rng1701 RNG_SEED 1701 +``` + +This method trains the RPN module jointly with the Fast R-CNN network, rather than alternating between training the two. It results in faster (~1.5x speedup) training times and similar detection accuracy. See these [slides](https://www.dropbox.com/s/xtr4yd4i5e0vw8g/iccv15_tutorial_training_rbg.pdf?dl=0) for more details. + +Artifacts generated by the scripts in `tools` are also written under this `output` directory. + +Trained Fast R-CNN networks are saved under: + +``` +output/<experiment directory>/<dataset name>/ +``` + +Test outputs are saved under: + +``` +output/<experiment directory>/<dataset name>/<network snapshot name>/ +``` diff --git a/examples/faster-rcnn/data/README.md b/examples/faster-rcnn/data/README.md new file mode 100644 index 000000000..2f01d3d71 --- /dev/null +++ b/examples/faster-rcnn/data/README.md @@ -0,0 +1,69 @@ +This directory holds (*after you download them*): +- Caffe models pre-trained on ImageNet +- Faster R-CNN models +- Symlinks to datasets + +To download Caffe models (ZF, VGG16) pre-trained on ImageNet, run: + +``` +./data/scripts/fetch_imagenet_models.sh +``` + +This script will populate `data/imagenet_models`. + +To download Faster R-CNN models trained on VOC 2007, run: + +``` +./data/scripts/fetch_faster_rcnn_models.sh +``` + +This script will populate `data/faster_rcnn_models`. + +In order to train and test with PASCAL VOC, you will need to establish symlinks. +From the `data` directory (`cd data`): + +``` +# For VOC 2007 +ln -s /your/path/to/VOC2007/VOCdevkit VOCdevkit2007 + +# For VOC 2012 +ln -s /your/path/to/VOC2012/VOCdevkit VOCdevkit2012 +``` + +Install the MS COCO dataset at `/path/to/coco` and symlink it: + +``` +ln -s /path/to/coco coco +``` + +For COCO with Fast R-CNN, place object proposals under `coco_proposals` (inside +the `data` directory).
You can obtain proposals on COCO from Jan Hosang at +https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/object-recognition-and-scene-understanding/how-good-are-detection-proposals-really/. +For COCO, using MCG is recommended over selective search. MCG boxes can be downloaded +from http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/. +Use the tool `lib/datasets/tools/mcg_munge.py` to convert the downloaded MCG data +into the same file layout as those from Jan Hosang. + +Since you'll likely be experimenting with multiple installs of Fast/er R-CNN in +parallel, you'll probably want to keep all of this data in a shared place and +use symlinks. On my system I create the following symlinks inside `data`: + +``` +# data/cache holds various outputs created by the datasets package +ln -s /data/fast_rcnn_shared/cache + +# move the imagenet_models to shared location and symlink to them +ln -s /data/fast_rcnn_shared/imagenet_models + +# move the selective search data to a shared location and symlink to them +# (only applicable to Fast R-CNN training) +ln -s /data/fast_rcnn_shared/selective_search_data + +ln -s /data/VOC2007/VOCdevkit VOCdevkit2007 +ln -s /data/VOC2012/VOCdevkit VOCdevkit2012 +``` + +Annotations for the 5k image 'minival' subset of COCO val2014 that I like to use +can be found at https://dl.dropboxusercontent.com/s/o43o90bna78omob/instances_minival2014.json.zip?dl=0. +Annotations for COCO val2014 minus minival (~35k images) can be found at +https://dl.dropboxusercontent.com/s/s3tw5zcg7395368/instances_valminusminival2014.json.zip?dl=0. diff --git a/examples/faster-rcnn/data/demo/000456.jpg b/examples/faster-rcnn/data/demo/000456.jpg new file mode 100644 index 000000000..480afa5d5 Binary files /dev/null and b/examples/faster-rcnn/data/demo/000456.jpg differ diff --git a/examples/faster-rcnn/data/demo/000542.jpg b/examples/faster-rcnn/data/demo/000542.jpg new file mode 100644 index 000000000..d87e91929 Binary files /dev/null and b/examples/faster-rcnn/data/demo/000542.jpg differ diff --git a/examples/faster-rcnn/data/demo/001150.jpg b/examples/faster-rcnn/data/demo/001150.jpg new file mode 100644 index 000000000..e8c86a499 Binary files /dev/null and b/examples/faster-rcnn/data/demo/001150.jpg differ diff --git a/examples/faster-rcnn/data/demo/001763.jpg b/examples/faster-rcnn/data/demo/001763.jpg new file mode 100644 index 000000000..c80ebd9b0 Binary files /dev/null and b/examples/faster-rcnn/data/demo/001763.jpg differ diff --git a/examples/faster-rcnn/data/demo/004545.jpg b/examples/faster-rcnn/data/demo/004545.jpg new file mode 100644 index 000000000..4e06c202b Binary files /dev/null and b/examples/faster-rcnn/data/demo/004545.jpg differ diff --git a/examples/faster-rcnn/data/scripts/fetch_faster_rcnn_models.sh b/examples/faster-rcnn/data/scripts/fetch_faster_rcnn_models.sh new file mode 100755 index 000000000..9b76fd56d --- /dev/null +++ b/examples/faster-rcnn/data/scripts/fetch_faster_rcnn_models.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" +cd $DIR + +FILE=faster_rcnn_models.tgz +URL=https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0 +CHECKSUM=ac116844f66aefe29587214272054668 + +if [ -f $FILE ]; then + echo "File already exists. Checking md5..."
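+  # Pick the platform's md5 tool below: GNU/Linux ships md5sum, while macOS (Darwin) ships md5.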
+ os=`uname -s` + if [ "$os" = "Linux" ]; then + checksum=`md5sum $FILE | awk '{ print $1 }'` + elif [ "$os" = "Darwin" ]; then + checksum=`cat $FILE | md5` + fi + if [ "$checksum" = "$CHECKSUM" ]; then + echo "Checksum is correct. No need to download." + exit 0 + else + echo "Checksum is incorrect. Need to download again." + fi +fi + +echo "Downloading Faster R-CNN demo models (695M)..." + +wget $URL -O $FILE + +echo "Unzipping..." + +tar zxvf $FILE + +echo "Done. Please run this command again to verify that checksum = $CHECKSUM." diff --git a/examples/faster-rcnn/data/scripts/fetch_imagenet_models.sh b/examples/faster-rcnn/data/scripts/fetch_imagenet_models.sh new file mode 100755 index 000000000..46835b9fb --- /dev/null +++ b/examples/faster-rcnn/data/scripts/fetch_imagenet_models.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" +cd $DIR + +FILE=imagenet_models.tgz +URL=https://dl.dropbox.com/s/gstw7122padlf0l/imagenet_models.tgz?dl=0 +CHECKSUM=ed34ca912d6782edfb673a8c3a0bda6d + +if [ -f $FILE ]; then + echo "File already exists. Checking md5..." + os=`uname -s` + if [ "$os" = "Linux" ]; then + checksum=`md5sum $FILE | awk '{ print $1 }'` + elif [ "$os" = "Darwin" ]; then + checksum=`cat $FILE | md5` + fi + if [ "$checksum" = "$CHECKSUM" ]; then + echo "Checksum is correct. No need to download." + exit 0 + else + echo "Checksum is incorrect. Need to download again." + fi +fi + +echo "Downloading pretrained ImageNet models (1G)..." + +wget $URL -O $FILE + +echo "Unzipping..." + +tar zxvf $FILE + +echo "Done. Please run this command again to verify that checksum = $CHECKSUM." diff --git a/examples/faster-rcnn/data/scripts/fetch_selective_search_data.sh b/examples/faster-rcnn/data/scripts/fetch_selective_search_data.sh new file mode 100755 index 000000000..bf925fc5b --- /dev/null +++ b/examples/faster-rcnn/data/scripts/fetch_selective_search_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" +cd $DIR + +FILE=selective_search_data.tgz +URL=https://dl.dropboxusercontent.com/s/orrt7o6bp6ae0tc/selective_search_data.tgz?dl=0 +CHECKSUM=7078c1db87a7851b31966b96774cd9b9 + +if [ -f $FILE ]; then + echo "File already exists. Checking md5..." + os=`uname -s` + if [ "$os" = "Linux" ]; then + checksum=`md5sum $FILE | awk '{ print $1 }'` + elif [ "$os" = "Darwin" ]; then + checksum=`cat $FILE | md5` + fi + if [ "$checksum" = "$CHECKSUM" ]; then + echo "Checksum is correct. No need to download." + exit 0 + else + echo "Checksum is incorrect. Need to download again." + fi +fi + +echo "Downloading precomputed selective search boxes (0.5G)..." + +wget $URL -O $FILE + +echo "Unzipping..." + +tar zxvf $FILE + +echo "Done. Please run this command again to verify that checksum = $CHECKSUM." diff --git a/examples/faster-rcnn/experiments/README.md b/examples/faster-rcnn/experiments/README.md new file mode 100644 index 000000000..076257dac --- /dev/null +++ b/examples/faster-rcnn/experiments/README.md @@ -0,0 +1,5 @@ +Scripts are under `experiments/scripts`. + +Each script saves a log file under `experiments/logs`. + +Configuration override files used in the experiments are stored in `experiments/cfgs`. 
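+ +For example (a sketch; `ZF` and `pascal_voc` are just sample arguments): +``` +./experiments/scripts/faster_rcnn_alt_opt.sh ZF pascal_voc +# writes a timestamped log under experiments/logs/ and reads overrides from experiments/cfgs/faster_rcnn_alt_opt.yml +```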
diff --git a/examples/faster-rcnn/experiments/cfgs/faster_rcnn_alt_opt.yml b/examples/faster-rcnn/experiments/cfgs/faster_rcnn_alt_opt.yml new file mode 100644 index 000000000..9f11bd60d --- /dev/null +++ b/examples/faster-rcnn/experiments/cfgs/faster_rcnn_alt_opt.yml @@ -0,0 +1,5 @@ +EXP_DIR: faster_rcnn_alt_opt +TRAIN: + BG_THRESH_LO: 0.0 +TEST: + HAS_RPN: True diff --git a/examples/faster-rcnn/experiments/cfgs/faster_rcnn_end2end.yml b/examples/faster-rcnn/experiments/cfgs/faster_rcnn_end2end.yml new file mode 100644 index 000000000..4c6a0545d --- /dev/null +++ b/examples/faster-rcnn/experiments/cfgs/faster_rcnn_end2end.yml @@ -0,0 +1,11 @@ +EXP_DIR: faster_rcnn_end2end +TRAIN: + HAS_RPN: True + IMS_PER_BATCH: 1 + BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True + RPN_POSITIVE_OVERLAP: 0.7 + RPN_BATCHSIZE: 256 + PROPOSAL_METHOD: gt + BG_THRESH_LO: 0.0 +TEST: + HAS_RPN: True diff --git a/examples/faster-rcnn/experiments/scripts/fast_rcnn.sh b/examples/faster-rcnn/experiments/scripts/fast_rcnn.sh new file mode 100755 index 000000000..26f91fdc1 --- /dev/null +++ b/examples/faster-rcnn/experiments/scripts/fast_rcnn.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Usage: +# ./experiments/scripts/fast_rcnn.sh NET DATASET [options args to {train,test}_net.py] +# DATASET is either pascal_voc or coco. +# +# Example: +# ./experiments/scripts/fast_rcnn.sh VGG_CNN_M_1024 pascal_voc \ +# --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" + +set -x +set -e + +export PYTHONUNBUFFERED="True" + +NET=$1 +NET_lc=${NET,,} +DATASET=$2 + +array=( $@ ) +len=${#array[@]} +EXTRA_ARGS=${array[@]:2:$len} +EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} + +case $DATASET in + pascal_voc) + TRAIN_IMDB="voc_2007_trainval" + TEST_IMDB="voc_2007_test" + PT_DIR="pascal_voc" + ITERS=40000 + ;; + coco) + TRAIN_IMDB="coco_2014_train" + TEST_IMDB="coco_2014_minival" + PT_DIR="coco" + ITERS=280000 + ;; + *) + echo "No dataset given" + exit + ;; +esac + +LOG="experiments/logs/fast_rcnn_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" + +if [[ ! 
-e "experiments/logs" ]]; then + mkdir experiments/logs +fi + +exec &> >(tee -a "$LOG") +echo Logging output to "$LOG" + +time ./tools/train_net.py \ + --solver ../../models/intel_optimized_models/faster-rcnn/${PT_DIR}/${NET}/fast_rcnn/solver.prototxt \ + --weights data/imagenet_models/${NET}.v2.caffemodel \ + --imdb ${TRAIN_IMDB} \ + --iters ${ITERS} \ + ${EXTRA_ARGS} + +set +x +NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` +set -x + +time ./tools/test_net.py \ + --def ../../models/intel_optimized_models/faster-rcnn/${PT_DIR}/${NET}/fast_rcnn/test.prototxt \ + --net ${NET_FINAL} \ + --imdb ${TEST_IMDB} \ + ${EXTRA_ARGS} diff --git a/examples/faster-rcnn/experiments/scripts/faster_rcnn_alt_opt.sh b/examples/faster-rcnn/experiments/scripts/faster_rcnn_alt_opt.sh new file mode 100755 index 000000000..95717e7db --- /dev/null +++ b/examples/faster-rcnn/experiments/scripts/faster_rcnn_alt_opt.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Usage: +# ./experiments/scripts/faster_rcnn_alt_opt.sh NET DATASET [options args to {train,test}_net.py] +# DATASET is only pascal_voc for now +# +# Example: +# ./experiments/scripts/faster_rcnn_alt_opt.sh VGG_CNN_M_1024 pascal_voc \ +# --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" + +set -x +set -e + +export PYTHONUNBUFFERED="True" + +NET=$1 +NET_lc=${NET,,} +DATASET=$2 + +array=( $@ ) +len=${#array[@]} +EXTRA_ARGS=${array[@]:2:$len} +EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} + +case $DATASET in + pascal_voc) + TRAIN_IMDB="voc_2007_trainval" + TEST_IMDB="voc_2007_test" + PT_DIR="pascal_voc" + ITERS=40000 + ;; + coco) + echo "Not implemented: use experiments/scripts/faster_rcnn_end2end.sh for coco" + exit + ;; + *) + echo "No dataset given" + exit + ;; +esac + +LOG="experiments/logs/faster_rcnn_alt_opt_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" + +if [[ ! -e "experiments/logs" ]]; then + mkdir experiments/logs +fi + +exec &> >(tee -a "$LOG") +echo Logging output to "$LOG" + +time ./tools/train_faster_rcnn_alt_opt.py \ + --net_name ${NET} \ + --weights data/imagenet_models/${NET}.v2.caffemodel \ + --imdb ${TRAIN_IMDB} \ + --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ + ${EXTRA_ARGS} + +set +x +NET_FINAL=`grep "Final model:" ${LOG} | awk '{print $3}'` +set -x + +time ./tools/test_net.py \ + --def ../../models/intel_optimized_models/faster-rcnn/${PT_DIR}/${NET}/faster_rcnn_alt_opt/faster_rcnn_test.pt \ + --net ${NET_FINAL} \ + --imdb ${TEST_IMDB} \ + --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ + ${EXTRA_ARGS} diff --git a/examples/faster-rcnn/experiments/scripts/faster_rcnn_end2end.sh b/examples/faster-rcnn/experiments/scripts/faster_rcnn_end2end.sh new file mode 100755 index 000000000..ba3e55e16 --- /dev/null +++ b/examples/faster-rcnn/experiments/scripts/faster_rcnn_end2end.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Usage: +# ./experiments/scripts/faster_rcnn_end2end.sh NET DATASET [options args to {train,test}_net.py] +# DATASET is either pascal_voc or coco. 
+# +# Example: +# ./experiments/scripts/faster_rcnn_end2end.sh VGG_CNN_M_1024 pascal_voc \ +# --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" + +set -x +set -e + +export PYTHONUNBUFFERED="True" + +NET=$1 +NET_lc=${NET,,} +DATASET=$2 + +array=( $@ ) +len=${#array[@]} +EXTRA_ARGS=${array[@]:2:$len} +EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} + +case $DATASET in + pascal_voc) + TRAIN_IMDB="voc_2007_trainval" + TEST_IMDB="voc_2007_test" + PT_DIR="pascal_voc" + ITERS=70000 + ;; + coco) + # This is a very long and slow training schedule + # You can probably use fewer iterations and reduce the + # time to the LR drop (set in the solver to 350,000 iterations). + TRAIN_IMDB="coco_2014_train" + TEST_IMDB="coco_2014_minival" + PT_DIR="coco" + ITERS=490000 + ;; + *) + echo "No dataset given" + exit + ;; +esac + +LOG="experiments/logs/faster_rcnn_end2end_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" + +if [[ ! -e "experiments/logs" ]]; then + mkdir experiments/logs +fi + +exec &> >(tee -a "$LOG") +echo Logging output to "$LOG" + +time ./tools/train_net.py \ + --solver ../../models/intel_optimized_models/faster-rcnn/${PT_DIR}/${NET}/faster_rcnn_end2end/solver.prototxt \ + --weights data/imagenet_models/${NET}.v2.caffemodel \ + --imdb ${TRAIN_IMDB} \ + --iters ${ITERS} \ + --cfg experiments/cfgs/faster_rcnn_end2end.yml \ + ${EXTRA_ARGS} + +set +x +NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` +set -x + +time ./tools/test_net.py \ + --def ../../models/intel_optimized_models/faster-rcnn/${PT_DIR}/${NET}/faster_rcnn_end2end/test.prototxt \ + --net ${NET_FINAL} \ + --imdb ${TEST_IMDB} \ + --cfg experiments/cfgs/faster_rcnn_end2end.yml \ + ${EXTRA_ARGS} diff --git a/examples/faster-rcnn/lib/Makefile b/examples/faster-rcnn/lib/Makefile new file mode 100644 index 000000000..a48239850 --- /dev/null +++ b/examples/faster-rcnn/lib/Makefile @@ -0,0 +1,3 @@ +all: + python setup.py build_ext --inplace + rm -rf build diff --git a/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m new file mode 100644 index 000000000..629597a1f --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m @@ -0,0 +1,14 @@ +function VOCopts = get_voc_opts(path) + +tmp = pwd; +cd(path); +try + addpath('VOCcode'); + VOCinit; +catch + rmpath('VOCcode'); + cd(tmp); + error(sprintf('VOCcode directory not found under %s', path)); +end +rmpath('VOCcode'); +cd(tmp); diff --git a/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m new file mode 100644 index 000000000..1911a0e39 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m @@ -0,0 +1,56 @@ +function res = voc_eval(path, comp_id, test_set, output_dir) + +VOCopts = get_voc_opts(path); +VOCopts.testset = test_set; + +for i = 1:length(VOCopts.classes) + cls = VOCopts.classes{i}; + res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); +end + +fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); +fprintf('Results:\n'); +aps = [res(:).ap]'; +fprintf('%.1f\n', aps * 100); +fprintf('%.1f\n', mean(aps) * 100); +fprintf('~~~~~~~~~~~~~~~~~~~~\n'); + +function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) + +test_set = VOCopts.testset; +year = VOCopts.dataset(4:end); + +addpath(fullfile(VOCopts.datadir, 'VOCcode')); + +res_fn = sprintf(VOCopts.detrespath, comp_id, cls); + +recall = []; 
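+% These recall/prec/ap defaults are returned unchanged when do_eval (below) is false.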
+prec = []; +ap = 0; +ap_auc = 0; + +do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); +if do_eval + % Bug in VOCevaldet requires that tic has been called first + tic; + [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); + ap_auc = xVOCap(recall, prec); + + % force plot limits + ylim([0 1]); + xlim([0 1]); + + print(gcf, '-djpeg', '-r0', ... + [output_dir '/' cls '_pr.jpg']); +end +fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); + +res.recall = recall; +res.prec = prec; +res.ap = ap; +res.ap_auc = ap_auc; + +save([output_dir '/' cls '_pr.mat'], ... + 'res', 'recall', 'prec', 'ap', 'ap_auc'); + +rmpath(fullfile(VOCopts.datadir, 'VOCcode')); diff --git a/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m new file mode 100644 index 000000000..de6c6283c --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m @@ -0,0 +1,10 @@ +function ap = xVOCap(rec,prec) +% From the PASCAL VOC 2011 devkit + +mrec=[0 ; rec ; 1]; +mpre=[0 ; prec ; 0]; +for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); +end +i=find(mrec(2:end)~=mrec(1:end-1))+1; +ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); diff --git a/examples/faster-rcnn/lib/datasets/__init__.py b/examples/faster-rcnn/lib/datasets/__init__.py new file mode 100644 index 000000000..7ba6a65c5 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/__init__.py @@ -0,0 +1,6 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- diff --git a/examples/faster-rcnn/lib/datasets/coco.py b/examples/faster-rcnn/lib/datasets/coco.py new file mode 100644 index 000000000..bfe8ff3d7 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/coco.py @@ -0,0 +1,394 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +from datasets.imdb import imdb +import datasets.ds_utils as ds_utils +from fast_rcnn.config import cfg +import os.path as osp +import sys +import os +import numpy as np +import scipy.sparse +import scipy.io as sio +import cPickle +import json +import uuid +# COCO API +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from pycocotools import mask as COCOmask + +def _filter_crowd_proposals(roidb, crowd_thresh): + """ + Finds proposals that are inside crowd regions and marks them with + overlap = -1 (for all gt rois), which means they will be excluded from + training. 
+ """ + for ix, entry in enumerate(roidb): + overlaps = entry['gt_overlaps'].toarray() + crowd_inds = np.where(overlaps.max(axis=1) == -1)[0] + non_gt_inds = np.where(entry['gt_classes'] == 0)[0] + if len(crowd_inds) == 0 or len(non_gt_inds) == 0: + continue + iscrowd = [int(True) for _ in xrange(len(crowd_inds))] + crowd_boxes = ds_utils.xyxy_to_xywh(entry['boxes'][crowd_inds, :]) + non_gt_boxes = ds_utils.xyxy_to_xywh(entry['boxes'][non_gt_inds, :]) + ious = COCOmask.iou(non_gt_boxes, crowd_boxes, iscrowd) + bad_inds = np.where(ious.max(axis=1) > crowd_thresh)[0] + overlaps[non_gt_inds[bad_inds], :] = -1 + roidb[ix]['gt_overlaps'] = scipy.sparse.csr_matrix(overlaps) + return roidb + +class coco(imdb): + def __init__(self, image_set, year): + imdb.__init__(self, 'coco_' + year + '_' + image_set) + # COCO specific config options + self.config = {'top_k' : 2000, + 'use_salt' : True, + 'cleanup' : True, + 'crowd_thresh' : 0.7, + 'min_size' : 2} + # name, paths + self._year = year + self._image_set = image_set + self._data_path = osp.join(cfg.DATA_DIR, 'coco') + # load COCO API, classes, class <-> id mappings + self._COCO = COCO(self._get_ann_file()) + cats = self._COCO.loadCats(self._COCO.getCatIds()) + self._classes = tuple(['__background__'] + [c['name'] for c in cats]) + self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes))) + self._class_to_coco_cat_id = dict(zip([c['name'] for c in cats], + self._COCO.getCatIds())) + self._image_index = self._load_image_set_index() + # Default to roidb handler + self.set_proposal_method('selective_search') + self.competition_mode(False) + + # Some image sets are "views" (i.e. subsets) into others. + # For example, minival2014 is a random 5000 image subset of val2014. + # This mapping tells us where the view's images and proposals come from. + self._view_map = { + 'minival2014' : 'val2014', # 5k val2014 subset + 'valminusminival2014' : 'val2014', # val2014 \setminus minival2014 + } + coco_name = image_set + year # e.g., "val2014" + self._data_name = (self._view_map[coco_name] + if self._view_map.has_key(coco_name) + else coco_name) + # Dataset splits that have ground-truth annotations (test splits + # do not have gt annotations) + self._gt_splits = ('train', 'val', 'minival') + + def _get_ann_file(self): + prefix = 'instances' if self._image_set.find('test') == -1 \ + else 'image_info' + return osp.join(self._data_path, 'annotations', + prefix + '_' + self._image_set + self._year + '.json') + + def _load_image_set_index(self): + """ + Load image ids. + """ + image_ids = self._COCO.getImgIds() + return image_ids + + def _get_widths(self): + anns = self._COCO.loadImgs(self._image_index) + widths = [ann['width'] for ann in anns] + return widths + + def image_path_at(self, i): + """ + Return the absolute path to image i in the image sequence. + """ + return self.image_path_from_index(self._image_index[i]) + + def image_path_from_index(self, index): + """ + Construct an image path from the image's "index" identifier. 
+ """ + # Example image path for index=119993: + # images/train2014/COCO_train2014_000000119993.jpg + file_name = ('COCO_' + self._data_name + '_' + + str(index).zfill(12) + '.jpg') + image_path = osp.join(self._data_path, 'images', + self._data_name, file_name) + assert osp.exists(image_path), \ + 'Path does not exist: {}'.format(image_path) + return image_path + + def selective_search_roidb(self): + return self._roidb_from_proposals('selective_search') + + def edge_boxes_roidb(self): + return self._roidb_from_proposals('edge_boxes_AR') + + def mcg_roidb(self): + return self._roidb_from_proposals('MCG') + + def _roidb_from_proposals(self, method): + """ + Creates a roidb from pre-computed proposals of a particular methods. + """ + top_k = self.config['top_k'] + cache_file = osp.join(self.cache_path, self.name + + '_{:s}_top{:d}'.format(method, top_k) + + '_roidb.pkl') + + if osp.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = cPickle.load(fid) + print '{:s} {:s} roidb loaded from {:s}'.format(self.name, method, + cache_file) + return roidb + + if self._image_set in self._gt_splits: + gt_roidb = self.gt_roidb() + method_roidb = self._load_proposals(method, gt_roidb) + roidb = imdb.merge_roidbs(gt_roidb, method_roidb) + # Make sure we don't use proposals that are contained in crowds + roidb = _filter_crowd_proposals(roidb, self.config['crowd_thresh']) + else: + roidb = self._load_proposals(method, None) + with open(cache_file, 'wb') as fid: + cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) + print 'wrote {:s} roidb to {:s}'.format(method, cache_file) + return roidb + + def _load_proposals(self, method, gt_roidb): + """ + Load pre-computed proposals in the format provided by Jan Hosang: + http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- + computing/research/object-recognition-and-scene-understanding/how- + good-are-detection-proposals-really/ + For MCG, use boxes from http://www.eecs.berkeley.edu/Research/Projects/ + CS/vision/grouping/mcg/ and convert the file layout using + lib/datasets/tools/mcg_munge.py. + """ + box_list = [] + top_k = self.config['top_k'] + valid_methods = [ + 'MCG', + 'selective_search', + 'edge_boxes_AR', + 'edge_boxes_70'] + assert method in valid_methods + + print 'Loading {} boxes'.format(method) + for i, index in enumerate(self._image_index): + if i % 1000 == 0: + print '{:d} / {:d}'.format(i + 1, len(self._image_index)) + + box_file = osp.join( + cfg.DATA_DIR, 'coco_proposals', method, 'mat', + self._get_box_file(index)) + + raw_data = sio.loadmat(box_file)['boxes'] + boxes = np.maximum(raw_data - 1, 0).astype(np.uint16) + if method == 'MCG': + # Boxes from the MCG website are in (y1, x1, y2, x2) order + boxes = boxes[:, (1, 0, 3, 2)] + # Remove duplicate boxes and very small boxes and then take top k + keep = ds_utils.unique_boxes(boxes) + boxes = boxes[keep, :] + keep = ds_utils.filter_small_boxes(boxes, self.config['min_size']) + boxes = boxes[keep, :] + boxes = boxes[:top_k, :] + box_list.append(boxes) + # Sanity check + im_ann = self._COCO.loadImgs(index)[0] + width = im_ann['width'] + height = im_ann['height'] + ds_utils.validate_boxes(boxes, width=width, height=height) + return self.create_roidb_from_box_list(box_list, gt_roidb) + + def gt_roidb(self): + """ + Return the database of ground-truth regions of interest. + This function loads/saves from/to a cache file to speed up future calls. 
+ """ + cache_file = osp.join(self.cache_path, self.name + '_gt_roidb.pkl') + if osp.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = cPickle.load(fid) + print '{} gt roidb loaded from {}'.format(self.name, cache_file) + return roidb + + gt_roidb = [self._load_coco_annotation(index) + for index in self._image_index] + + with open(cache_file, 'wb') as fid: + cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) + print 'wrote gt roidb to {}'.format(cache_file) + return gt_roidb + + def _load_coco_annotation(self, index): + """ + Loads COCO bounding-box instance annotations. Crowd instances are + handled by marking their overlaps (with all categories) to -1. This + overlap value means that crowd "instances" are excluded from training. + """ + im_ann = self._COCO.loadImgs(index)[0] + width = im_ann['width'] + height = im_ann['height'] + + annIds = self._COCO.getAnnIds(imgIds=index, iscrowd=None) + objs = self._COCO.loadAnns(annIds) + # Sanitize bboxes -- some are invalid + valid_objs = [] + for obj in objs: + x1 = np.max((0, obj['bbox'][0])) + y1 = np.max((0, obj['bbox'][1])) + x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) + y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) + if obj['area'] > 0 and x2 >= x1 and y2 >= y1: + obj['clean_bbox'] = [x1, y1, x2, y2] + valid_objs.append(obj) + objs = valid_objs + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs), dtype=np.int32) + overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) + seg_areas = np.zeros((num_objs), dtype=np.float32) + + # Lookup table to map from COCO category ids to our internal class + # indices + coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], + self._class_to_ind[cls]) + for cls in self._classes[1:]]) + + for ix, obj in enumerate(objs): + cls = coco_cat_id_to_class_ind[obj['category_id']] + boxes[ix, :] = obj['clean_bbox'] + gt_classes[ix] = cls + seg_areas[ix] = obj['area'] + if obj['iscrowd']: + # Set overlap to -1 for all classes for crowd objects + # so they will be excluded during training + overlaps[ix, :] = -1.0 + else: + overlaps[ix, cls] = 1.0 + + ds_utils.validate_boxes(boxes, width=width, height=height) + overlaps = scipy.sparse.csr_matrix(overlaps) + return {'boxes' : boxes, + 'gt_classes': gt_classes, + 'gt_overlaps' : overlaps, + 'flipped' : False, + 'seg_areas' : seg_areas} + + def _get_box_file(self, index): + # first 14 chars / first 22 chars / all chars + .mat + # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat + file_name = ('COCO_' + self._data_name + + '_' + str(index).zfill(12) + '.mat') + return osp.join(file_name[:14], file_name[:22], file_name) + + def _print_detection_eval_metrics(self, coco_eval): + IoU_lo_thresh = 0.5 + IoU_hi_thresh = 0.95 + def _get_thr_ind(coco_eval, thr): + ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & + (coco_eval.params.iouThrs < thr + 1e-5))[0][0] + iou_thr = coco_eval.params.iouThrs[ind] + assert np.isclose(iou_thr, thr) + return ind + + ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) + ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) + # precision has dims (iou, recall, cls, area range, max dets) + # area range index 0: all area ranges + # max dets index 2: 100 per image + precision = \ + coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] + ap_default = np.mean(precision[precision > -1]) + print ('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' + '~~~~').format(IoU_lo_thresh, 
IoU_hi_thresh) + print '{:.1f}'.format(100 * ap_default) + for cls_ind, cls in enumerate(self.classes): + if cls == '__background__': + continue + # minus 1 because of __background__ + precision = coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, cls_ind - 1, 0, 2] + ap = np.mean(precision[precision > -1]) + print '{:.1f}'.format(100 * ap) + + print '~~~~ Summary metrics ~~~~' + coco_eval.summarize() + + def _do_detection_eval(self, res_file, output_dir): + ann_type = 'bbox' + coco_dt = self._COCO.loadRes(res_file) + coco_eval = COCOeval(self._COCO, coco_dt) + coco_eval.params.useSegm = (ann_type == 'segm') + coco_eval.evaluate() + coco_eval.accumulate() + self._print_detection_eval_metrics(coco_eval) + eval_file = osp.join(output_dir, 'detection_results.pkl') + with open(eval_file, 'wb') as fid: + cPickle.dump(coco_eval, fid, cPickle.HIGHEST_PROTOCOL) + print 'Wrote COCO eval results to: {}'.format(eval_file) + + def _coco_results_one_category(self, boxes, cat_id): + results = [] + for im_ind, index in enumerate(self.image_index): + dets = boxes[im_ind].astype(np.float) + if dets == []: + continue + scores = dets[:, -1] + xs = dets[:, 0] + ys = dets[:, 1] + ws = dets[:, 2] - xs + 1 + hs = dets[:, 3] - ys + 1 + results.extend( + [{'image_id' : index, + 'category_id' : cat_id, + 'bbox' : [xs[k], ys[k], ws[k], hs[k]], + 'score' : scores[k]} for k in xrange(dets.shape[0])]) + return results + + def _write_coco_results_file(self, all_boxes, res_file): + # [{"image_id": 42, + # "category_id": 18, + # "bbox": [258.15,41.29,348.26,243.78], + # "score": 0.236}, ...] + results = [] + for cls_ind, cls in enumerate(self.classes): + if cls == '__background__': + continue + print 'Collecting {} results ({:d}/{:d})'.format(cls, cls_ind, + self.num_classes - 1) + coco_cat_id = self._class_to_coco_cat_id[cls] + results.extend(self._coco_results_one_category(all_boxes[cls_ind], + coco_cat_id)) + print 'Writing results json to {}'.format(res_file) + with open(res_file, 'w') as fid: + json.dump(results, fid) + + def evaluate_detections(self, all_boxes, output_dir): + res_file = osp.join(output_dir, ('detections_' + + self._image_set + + self._year + + '_results')) + if self.config['use_salt']: + res_file += '_{}'.format(str(uuid.uuid4())) + res_file += '.json' + self._write_coco_results_file(all_boxes, res_file) + # Only do evaluation on non-test sets + if self._image_set.find('test') == -1: + self._do_detection_eval(res_file, output_dir) + # Optionally cleanup results json file + if self.config['cleanup']: + os.remove(res_file) + + def competition_mode(self, on): + if on: + self.config['use_salt'] = False + self.config['cleanup'] = False + else: + self.config['use_salt'] = True + self.config['cleanup'] = True diff --git a/examples/faster-rcnn/lib/datasets/ds_utils.py b/examples/faster-rcnn/lib/datasets/ds_utils.py new file mode 100644 index 000000000..f66a7f687 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/ds_utils.py @@ -0,0 +1,41 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def unique_boxes(boxes, scale=1.0): + """Return indices of unique boxes.""" + v = np.array([1, 1e3, 1e6, 1e9]) + hashes = np.round(boxes * scale).dot(v) + _, index = np.unique(hashes, return_index=True) + return np.sort(index) + +def xywh_to_xyxy(boxes): + """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" + 
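+    # Boxes are stored as inclusive pixel indices, so a box of width w
+    # starting at x spans columns x .. x + w - 1 (hence the "- 1" below);
+    # e.g. xywh_to_xyxy(np.array([[10, 20, 5, 5]])) gives [[10, 20, 14, 24]].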
return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) + +def xyxy_to_xywh(boxes): + """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" + return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) + +def validate_boxes(boxes, width=0, height=0): + """Check that a set of boxes are valid.""" + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + assert (x1 >= 0).all() + assert (y1 >= 0).all() + assert (x2 >= x1).all() + assert (y2 >= y1).all() + assert (x2 < width).all() + assert (y2 < height).all() + +def filter_small_boxes(boxes, min_size): + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + keep = np.where((w >= min_size) & (h > min_size))[0] + return keep diff --git a/examples/faster-rcnn/lib/datasets/factory.py b/examples/faster-rcnn/lib/datasets/factory.py new file mode 100644 index 000000000..8c3fdb898 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/factory.py @@ -0,0 +1,42 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Factory method for easily getting imdbs by name.""" + +__sets = {} + +from datasets.pascal_voc import pascal_voc +from datasets.coco import coco +import numpy as np + +# Set up voc__ using selective search "fast" mode +for year in ['2007', '2012']: + for split in ['train', 'val', 'trainval', 'test']: + name = 'voc_{}_{}'.format(year, split) + __sets[name] = (lambda split=split, year=year: pascal_voc(split, year)) + +# Set up coco_2014_ +for year in ['2014']: + for split in ['train', 'val', 'minival', 'valminusminival']: + name = 'coco_{}_{}'.format(year, split) + __sets[name] = (lambda split=split, year=year: coco(split, year)) + +# Set up coco_2015_ +for year in ['2015']: + for split in ['test', 'test-dev']: + name = 'coco_{}_{}'.format(year, split) + __sets[name] = (lambda split=split, year=year: coco(split, year)) + +def get_imdb(name): + """Get an imdb (image database) by name.""" + if not __sets.has_key(name): + raise KeyError('Unknown dataset: {}'.format(name)) + return __sets[name]() + +def list_imdbs(): + """List all registered imdbs.""" + return __sets.keys() diff --git a/examples/faster-rcnn/lib/datasets/imdb.py b/examples/faster-rcnn/lib/datasets/imdb.py new file mode 100644 index 000000000..b56bf0a80 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/imdb.py @@ -0,0 +1,253 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import os +import os.path as osp +import PIL +from utils.cython_bbox import bbox_overlaps +import numpy as np +import scipy.sparse +from fast_rcnn.config import cfg + +class imdb(object): + """Image database.""" + + def __init__(self, name): + self._name = name + self._num_classes = 0 + self._classes = [] + self._image_index = [] + self._obj_proposer = 'selective_search' + self._roidb = None + self._roidb_handler = self.default_roidb + # Use this dict for storing dataset specific config options + self.config = {} + + @property + def name(self): + return self._name + + @property + def num_classes(self): + return len(self._classes) + + @property + def classes(self): + return self._classes + + @property + def image_index(self): + return 
self._image_index + + @property + def roidb_handler(self): + return self._roidb_handler + + @roidb_handler.setter + def roidb_handler(self, val): + self._roidb_handler = val + + def set_proposal_method(self, method): + method = eval('self.' + method + '_roidb') + self.roidb_handler = method + + @property + def roidb(self): + # A roidb is a list of dictionaries, each with the following keys: + # boxes + # gt_overlaps + # gt_classes + # flipped + if self._roidb is not None: + return self._roidb + self._roidb = self.roidb_handler() + return self._roidb + + @property + def cache_path(self): + cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) + if not os.path.exists(cache_path): + os.makedirs(cache_path) + return cache_path + + @property + def num_images(self): + return len(self.image_index) + + def image_path_at(self, i): + raise NotImplementedError + + def default_roidb(self): + raise NotImplementedError + + def evaluate_detections(self, all_boxes, output_dir=None): + """ + all_boxes is a list of length number-of-classes. + Each list element is a list of length number-of-images. + Each of those list elements is either an empty list [] + or a numpy array of detection. + + all_boxes[class][image] = [] or np.array of shape #dets x 5 + """ + raise NotImplementedError + + def _get_widths(self): + return [PIL.Image.open(self.image_path_at(i)).size[0] + for i in xrange(self.num_images)] + + def append_flipped_images(self): + num_images = self.num_images + widths = self._get_widths() + for i in xrange(num_images): + boxes = self.roidb[i]['boxes'].copy() + oldx1 = boxes[:, 0].copy() + oldx2 = boxes[:, 2].copy() + boxes[:, 0] = widths[i] - oldx2 - 1 + boxes[:, 2] = widths[i] - oldx1 - 1 + assert (boxes[:, 2] >= boxes[:, 0]).all() + entry = {'boxes' : boxes, + 'gt_overlaps' : self.roidb[i]['gt_overlaps'], + 'gt_classes' : self.roidb[i]['gt_classes'], + 'flipped' : True} + self.roidb.append(entry) + self._image_index = self._image_index * 2 + + def evaluate_recall(self, candidate_boxes=None, thresholds=None, + area='all', limit=None): + """Evaluate detection proposal recall metrics. 
+ + Returns: + results: dictionary of results with keys + 'ar': average recall + 'recalls': vector recalls at each IoU overlap threshold + 'thresholds': vector of IoU overlap thresholds + 'gt_overlaps': vector of all ground-truth overlaps + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { 'all': 0, 'small': 1, 'medium': 2, 'large': 3, + '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7} + area_ranges = [ [0**2, 1e5**2], # all + [0**2, 32**2], # small + [32**2, 96**2], # medium + [96**2, 1e5**2], # large + [96**2, 128**2], # 96-128 + [128**2, 256**2], # 128-256 + [256**2, 512**2], # 256-512 + [512**2, 1e5**2], # 512-inf + ] + assert areas.has_key(area), 'unknown area range: {}'.format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = np.zeros(0) + num_pos = 0 + for i in xrange(self.num_images): + # Checking for max_overlaps == 1 avoids including crowd annotations + # (...pretty hacking :/) + max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1) + gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & + (max_gt_overlaps == 1))[0] + gt_boxes = self.roidb[i]['boxes'][gt_inds, :] + gt_areas = self.roidb[i]['seg_areas'][gt_inds] + valid_gt_inds = np.where((gt_areas >= area_range[0]) & + (gt_areas <= area_range[1]))[0] + gt_boxes = gt_boxes[valid_gt_inds, :] + num_pos += len(valid_gt_inds) + + if candidate_boxes is None: + # If candidate_boxes is not supplied, the default is to use the + # non-ground-truth boxes from this roidb + non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0] + boxes = self.roidb[i]['boxes'][non_gt_inds, :] + else: + boxes = candidate_boxes[i] + if boxes.shape[0] == 0: + continue + if limit is not None and boxes.shape[0] > limit: + boxes = boxes[:limit, :] + + overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + for j in xrange(gt_boxes.shape[0]): + # find which proposal box maximally covers each gt box + argmax_overlaps = overlaps.argmax(axis=0) + # and get the iou amount of coverage for each gt box + max_overlaps = overlaps.max(axis=0) + # find which gt box is 'best' covered (i.e. 
'best' = most iou) + gt_ind = max_overlaps.argmax() + gt_ovr = max_overlaps.max() + assert(gt_ovr >= 0) + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert(_gt_overlaps[j] == gt_ovr) + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + # append recorded iou coverage level + gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + + gt_overlaps = np.sort(gt_overlaps) + if thresholds is None: + step = 0.05 + thresholds = np.arange(0.5, 0.95 + 1e-5, step) + recalls = np.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, + 'gt_overlaps': gt_overlaps} + + def create_roidb_from_box_list(self, box_list, gt_roidb): + assert len(box_list) == self.num_images, \ + 'Number of boxes must match number of ground-truth images' + roidb = [] + for i in xrange(self.num_images): + boxes = box_list[i] + num_boxes = boxes.shape[0] + overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) + + if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: + gt_boxes = gt_roidb[i]['boxes'] + gt_classes = gt_roidb[i]['gt_classes'] + gt_overlaps = bbox_overlaps(boxes.astype(np.float), + gt_boxes.astype(np.float)) + argmaxes = gt_overlaps.argmax(axis=1) + maxes = gt_overlaps.max(axis=1) + I = np.where(maxes > 0)[0] + overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] + + overlaps = scipy.sparse.csr_matrix(overlaps) + roidb.append({ + 'boxes' : boxes, + 'gt_classes' : np.zeros((num_boxes,), dtype=np.int32), + 'gt_overlaps' : overlaps, + 'flipped' : False, + 'seg_areas' : np.zeros((num_boxes,), dtype=np.float32), + }) + return roidb + + @staticmethod + def merge_roidbs(a, b): + assert len(a) == len(b) + for i in xrange(len(a)): + a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) + a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], + b[i]['gt_classes'])) + a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], + b[i]['gt_overlaps']]) + a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'], + b[i]['seg_areas'])) + return a + + def competition_mode(self, on): + """Turn competition mode on or off.""" + pass diff --git a/examples/faster-rcnn/lib/datasets/pascal_voc.py b/examples/faster-rcnn/lib/datasets/pascal_voc.py new file mode 100644 index 000000000..b55f2f6b2 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/pascal_voc.py @@ -0,0 +1,344 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import os +from datasets.imdb import imdb +import datasets.ds_utils as ds_utils +import xml.etree.ElementTree as ET +import numpy as np +import scipy.sparse +import scipy.io as sio +import utils.cython_bbox +import cPickle +import subprocess +import uuid +from voc_eval import voc_eval +from fast_rcnn.config import cfg + +class pascal_voc(imdb): + def __init__(self, image_set, year, devkit_path=None): + imdb.__init__(self, 'voc_' + year + '_' + image_set) + self._year = year + self._image_set = image_set + self._devkit_path = self._get_default_path() if devkit_path is None \ + else 
devkit_path + self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) + self._classes = ('__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes))) + self._image_ext = '.jpg' + self._image_index = self._load_image_set_index() + # Default to roidb handler + self._roidb_handler = self.selective_search_roidb + self._salt = str(uuid.uuid4()) + self._comp_id = 'comp4' + + # PASCAL specific config options + self.config = {'cleanup' : True, + 'use_salt' : True, + 'use_diff' : False, + 'matlab_eval' : False, + 'rpn_file' : None, + 'min_size' : 2} + + assert os.path.exists(self._devkit_path), \ + 'VOCdevkit path does not exist: {}'.format(self._devkit_path) + assert os.path.exists(self._data_path), \ + 'Path does not exist: {}'.format(self._data_path) + + def image_path_at(self, i): + """ + Return the absolute path to image i in the image sequence. + """ + return self.image_path_from_index(self._image_index[i]) + + def image_path_from_index(self, index): + """ + Construct an image path from the image's "index" identifier. + """ + image_path = os.path.join(self._data_path, 'JPEGImages', + index + self._image_ext) + assert os.path.exists(image_path), \ + 'Path does not exist: {}'.format(image_path) + return image_path + + def _load_image_set_index(self): + """ + Load the indexes listed in this dataset's image set file. + """ + # Example path to image set file: + # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt + image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', + self._image_set + '.txt') + assert os.path.exists(image_set_file), \ + 'Path does not exist: {}'.format(image_set_file) + with open(image_set_file) as f: + image_index = [x.strip() for x in f.readlines()] + return image_index + + def _get_default_path(self): + """ + Return the default path where PASCAL VOC is expected to be installed. + """ + return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) + + def gt_roidb(self): + """ + Return the database of ground-truth regions of interest. + + This function loads/saves from/to a cache file to speed up future calls. + """ + cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') + if os.path.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = cPickle.load(fid) + print '{} gt roidb loaded from {}'.format(self.name, cache_file) + return roidb + + gt_roidb = [self._load_pascal_annotation(index) + for index in self.image_index] + with open(cache_file, 'wb') as fid: + cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) + print 'wrote gt roidb to {}'.format(cache_file) + + return gt_roidb + + def selective_search_roidb(self): + """ + Return the database of selective search regions of interest. + Ground-truth ROIs are also included. + + This function loads/saves from/to a cache file to speed up future calls. 
+ """ + cache_file = os.path.join(self.cache_path, + self.name + '_selective_search_roidb.pkl') + + if os.path.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = cPickle.load(fid) + print '{} ss roidb loaded from {}'.format(self.name, cache_file) + return roidb + + if int(self._year) == 2007 or self._image_set != 'test': + gt_roidb = self.gt_roidb() + ss_roidb = self._load_selective_search_roidb(gt_roidb) + roidb = imdb.merge_roidbs(gt_roidb, ss_roidb) + else: + roidb = self._load_selective_search_roidb(None) + with open(cache_file, 'wb') as fid: + cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL) + print 'wrote ss roidb to {}'.format(cache_file) + + return roidb + + def rpn_roidb(self): + if int(self._year) == 2007 or self._image_set != 'test': + gt_roidb = self.gt_roidb() + rpn_roidb = self._load_rpn_roidb(gt_roidb) + roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) + else: + roidb = self._load_rpn_roidb(None) + + return roidb + + def _load_rpn_roidb(self, gt_roidb): + filename = self.config['rpn_file'] + print 'loading {}'.format(filename) + assert os.path.exists(filename), \ + 'rpn data not found at: {}'.format(filename) + with open(filename, 'rb') as f: + box_list = cPickle.load(f) + return self.create_roidb_from_box_list(box_list, gt_roidb) + + def _load_selective_search_roidb(self, gt_roidb): + filename = os.path.abspath(os.path.join(cfg.DATA_DIR, + 'selective_search_data', + self.name + '.mat')) + assert os.path.exists(filename), \ + 'Selective search data not found at: {}'.format(filename) + raw_data = sio.loadmat(filename)['boxes'].ravel() + + box_list = [] + for i in xrange(raw_data.shape[0]): + boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 + keep = ds_utils.unique_boxes(boxes) + boxes = boxes[keep, :] + keep = ds_utils.filter_small_boxes(boxes, self.config['min_size']) + boxes = boxes[keep, :] + box_list.append(boxes) + + return self.create_roidb_from_box_list(box_list, gt_roidb) + + def _load_pascal_annotation(self, index): + """ + Load image and bounding boxes info from XML file in the PASCAL VOC + format. + """ + filename = os.path.join(self._data_path, 'Annotations', index + '.xml') + tree = ET.parse(filename) + objs = tree.findall('object') + if not self.config['use_diff']: + # Exclude the samples labeled as difficult + non_diff_objs = [ + obj for obj in objs if int(obj.find('difficult').text) == 0] + # if len(non_diff_objs) != len(objs): + # print 'Removed {} difficult objects'.format( + # len(objs) - len(non_diff_objs)) + objs = non_diff_objs + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs), dtype=np.int32) + overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) + # "Seg" area for pascal is just the box area + seg_areas = np.zeros((num_objs), dtype=np.float32) + + # Load object bounding boxes into a data frame. 
+ for ix, obj in enumerate(objs): + bbox = obj.find('bndbox') + # Make pixel indexes 0-based + x1 = float(bbox.find('xmin').text) - 1 + y1 = float(bbox.find('ymin').text) - 1 + x2 = float(bbox.find('xmax').text) - 1 + y2 = float(bbox.find('ymax').text) - 1 + cls = self._class_to_ind[obj.find('name').text.lower().strip()] + boxes[ix, :] = [x1, y1, x2, y2] + gt_classes[ix] = cls + overlaps[ix, cls] = 1.0 + seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) + + overlaps = scipy.sparse.csr_matrix(overlaps) + + return {'boxes' : boxes, + 'gt_classes': gt_classes, + 'gt_overlaps' : overlaps, + 'flipped' : False, + 'seg_areas' : seg_areas} + + def _get_comp_id(self): + comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] + else self._comp_id) + return comp_id + + def _get_voc_results_file_template(self): + # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt + filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' + path = os.path.join( + self._devkit_path, + 'results', + 'VOC' + self._year, + 'Main', + filename) + return path + + def _write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(self.classes): + if cls == '__background__': + continue + print 'Writing {} VOC results file'.format(cls) + filename = self._get_voc_results_file_template().format(cls) + with open(filename, 'wt') as f: + for im_ind, index in enumerate(self.image_index): + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + # the VOCdevkit expects 1-based indices + for k in xrange(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. + format(index, dets[k, -1], + dets[k, 0] + 1, dets[k, 1] + 1, + dets[k, 2] + 1, dets[k, 3] + 1)) + + def _do_python_eval(self, output_dir = 'output'): + annopath = os.path.join( + self._devkit_path, + 'VOC' + self._year, + 'Annotations', + '{:s}.xml') + imagesetfile = os.path.join( + self._devkit_path, + 'VOC' + self._year, + 'ImageSets', + 'Main', + self._image_set + '.txt') + cachedir = os.path.join(self._devkit_path, 'annotations_cache') + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(self._year) < 2010 else False + print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No') + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + for i, cls in enumerate(self._classes): + if cls == '__background__': + continue + filename = self._get_voc_results_file_template().format(cls) + rec, prec, ap = voc_eval( + filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, + use_07_metric=use_07_metric) + aps += [ap] + print('AP for {} = {:.4f}'.format(cls, ap)) + with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f: + cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) + print('Mean AP = {:.4f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('Results:') + for ap in aps: + print('{:.3f}'.format(ap)) + print('{:.3f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('') + print('--------------------------------------------------------------') + print('Results computed with the **unofficial** Python eval code.') + print('Results should be very close to the official MATLAB eval code.') + print('Recompute with `./tools/reval.py --matlab ...` for your paper.') + print('-- Thanks, The Management') + print('--------------------------------------------------------------') + + def _do_matlab_eval(self, output_dir='output'): + print '-----------------------------------------------------' + print 'Computing results with the official MATLAB eval code.' 
+ print '-----------------------------------------------------' + path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', + 'VOCdevkit-matlab-wrapper') + cmd = 'cd {} && '.format(path) + cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) + cmd += '-r "dbstop if error; ' + cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ + .format(self._devkit_path, self._get_comp_id(), + self._image_set, output_dir) + print('Running:\n{}'.format(cmd)) + status = subprocess.call(cmd, shell=True) + + def evaluate_detections(self, all_boxes, output_dir): + self._write_voc_results_file(all_boxes) + self._do_python_eval(output_dir) + if self.config['matlab_eval']: + self._do_matlab_eval(output_dir) + if self.config['cleanup']: + for cls in self._classes: + if cls == '__background__': + continue + filename = self._get_voc_results_file_template().format(cls) + os.remove(filename) + + def competition_mode(self, on): + if on: + self.config['use_salt'] = False + self.config['cleanup'] = False + else: + self.config['use_salt'] = True + self.config['cleanup'] = True + +if __name__ == '__main__': + from datasets.pascal_voc import pascal_voc + d = pascal_voc('trainval', '2007') + res = d.roidb + from IPython import embed; embed() diff --git a/examples/faster-rcnn/lib/datasets/tools/mcg_munge.py b/examples/faster-rcnn/lib/datasets/tools/mcg_munge.py new file mode 100644 index 000000000..1392aa308 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/tools/mcg_munge.py @@ -0,0 +1,38 @@ +import os +import sys + +"""Hacky tool to convert file system layout of MCG boxes downloaded from +http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/ +so that it's consistent with those computed by Jan Hosang (see: +http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- + computing/research/object-recognition-and-scene-understanding/how- + good-are-detection-proposals-really/) + +NB: Boxes from the MCG website are in (y1, x1, y2, x2) order. +Boxes from Hosang et al. are in (x1, y1, x2, y2) order. 
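+For example, MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat is moved to
+MCG/mat/COCO_val2014_0/COCO_val2014_000000193/COCO_val2014_000000193401.mat.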
+""" + +def munge(src_dir): + # stored as: ./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat + # want: ./MCG/mat/COCO_val2014_0/COCO_val2014_000000141/COCO_val2014_000000141334.mat + + files = os.listdir(src_dir) + for fn in files: + base, ext = os.path.splitext(fn) + # first 14 chars / first 22 chars / all chars + .mat + # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat + first = base[:14] + second = base[:22] + dst_dir = os.path.join('MCG', 'mat', first, second) + if not os.path.exists(dst_dir): + os.makedirs(dst_dir) + src = os.path.join(src_dir, fn) + dst = os.path.join(dst_dir, fn) + print 'MV: {} -> {}'.format(src, dst) + os.rename(src, dst) + +if __name__ == '__main__': + # src_dir should look something like: + # src_dir = 'MCG-COCO-val2014-boxes' + src_dir = sys.argv[1] + munge(src_dir) diff --git a/examples/faster-rcnn/lib/datasets/voc_eval.py b/examples/faster-rcnn/lib/datasets/voc_eval.py new file mode 100644 index 000000000..8d0a83076 --- /dev/null +++ b/examples/faster-rcnn/lib/datasets/voc_eval.py @@ -0,0 +1,200 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + +import xml.etree.ElementTree as ET +import os +import cPickle +import numpy as np + +def parse_rec(filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text), + int(bbox.find('ymin').text), + int(bbox.find('xmax').text), + int(bbox.find('ymax').text)] + objects.append(obj_struct) + + return objects + +def voc_ap(rec, prec, use_07_metric=False): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + + Top level function that does the PASCAL VOC evaluation. + + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. 
+ classname: Category name (duh) + cachedir: Directory for caching the annotations + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # assumes detections are in detpath.format(classname) + # assumes annotations are in annopath.format(imagename) + # assumes imagesetfile is a text file with each line an image name + # cachedir caches the annotations in a pickle file + + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, 'annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + print 'Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames)) + # save + print 'Saving cached annotations to {:s}'.format(cachefile) + with open(cachefile, 'w') as f: + cPickle.dump(recs, f) + else: + # load + with open(cachefile, 'r') as f: + recs = cPickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. 
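+    # tp/fp now hold one 0/1 indicator per detection, ordered by descending
+    # confidence, so the cumulative sums below give the true/false positive
+    # counts among the top-k detections at every rank k. As a small worked
+    # example with hypothetical values, tp = [1, 0, 1] and npos = 2 yield
+    # rec = [0.5, 0.5, 1.0] and prec = [1/1, 1/2, 2/3].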
+ + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/examples/faster-rcnn/lib/fast_rcnn/__init__.py b/examples/faster-rcnn/lib/fast_rcnn/__init__.py new file mode 100644 index 000000000..7ba6a65c5 --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/__init__.py @@ -0,0 +1,6 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- diff --git a/examples/faster-rcnn/lib/fast_rcnn/bbox_transform.py b/examples/faster-rcnn/lib/fast_rcnn/bbox_transform.py new file mode 100644 index 000000000..c02916305 --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/bbox_transform.py @@ -0,0 +1,76 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np + +def bbox_transform(ex_rois, gt_rois): + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths + ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths + gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights + + targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + +def bbox_transform_inv(boxes, deltas): + if boxes.shape[0] == 0: + return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) + + boxes = boxes.astype(deltas.dtype, copy=False) + + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + dx = deltas[:, 0::4] + dy = deltas[:, 1::4] + dw = deltas[:, 2::4] + dh = deltas[:, 3::4] + + pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] + pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] + pred_w = np.exp(dw) * widths[:, np.newaxis] + pred_h = np.exp(dh) * heights[:, np.newaxis] + + pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w + # y2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h + + return pred_boxes + +def clip_boxes(boxes, im_shape): + """ + Clip boxes to image boundaries. 
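+    Each x coordinate is clamped in place to [0, im_shape[1] - 1] and each
+    y coordinate to [0, im_shape[0] - 1].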
+ """ + + # x1 >= 0 + boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) + # y1 >= 0 + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) + return boxes diff --git a/examples/faster-rcnn/lib/fast_rcnn/config.py b/examples/faster-rcnn/lib/fast_rcnn/config.py new file mode 100644 index 000000000..beb6c7e08 --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/config.py @@ -0,0 +1,285 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Fast R-CNN config system. + +This file specifies default config options for Fast R-CNN. You should not +change values in this file. Instead, you should write a config file (in yaml) +and use cfg_from_file(yaml_file) to load it and override the default options. + +Most tools in $ROOT/tools take a --cfg option to specify an override file. + - See tools/{train,test}_net.py for example code that uses cfg_from_file() + - See experiments/cfgs/*.yml for example YAML config override files +""" + +import os +import os.path as osp +import numpy as np +# `pip install easydict` if you don't have it +from easydict import EasyDict as edict + +__C = edict() +# Consumers can get config by: +# from fast_rcnn_config import cfg +cfg = __C + +# +# Training options +# + +__C.TRAIN = edict() + +# Scales to use during training (can list multiple scales) +# Each scale is the pixel size of an image's shortest side +__C.TRAIN.SCALES = (600,) + +# Max pixel size of the longest side of a scaled input image +__C.TRAIN.MAX_SIZE = 1000 + +# Images to use per minibatch +__C.TRAIN.IMS_PER_BATCH = 2 + +# Minibatch size (number of regions of interest [ROIs]) +__C.TRAIN.BATCH_SIZE = 128 + +# Fraction of minibatch that is labeled foreground (i.e. class > 0) +__C.TRAIN.FG_FRACTION = 0.25 + +# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) +__C.TRAIN.FG_THRESH = 0.5 + +# Overlap threshold for a ROI to be considered background (class = 0 if +# overlap in [LO, HI)) +__C.TRAIN.BG_THRESH_HI = 0.5 +__C.TRAIN.BG_THRESH_LO = 0.1 + +# Use horizontally-flipped images during training? 
+__C.TRAIN.USE_FLIPPED = True + +# Train bounding-box regressors +__C.TRAIN.BBOX_REG = True + +# Overlap required between a ROI and ground-truth box in order for that ROI to +# be used as a bounding-box regression training example +__C.TRAIN.BBOX_THRESH = 0.5 + +# Iterations between snapshots +__C.TRAIN.SNAPSHOT_ITERS = 10000 + +# solver.prototxt specifies the snapshot path prefix, this adds an optional +# infix to yield the path: [_]_iters_XYZ.caffemodel +__C.TRAIN.SNAPSHOT_INFIX = '' + +# Use a prefetch thread in roi_data_layer.layer +# So far I haven't found this useful; likely more engineering work is required +__C.TRAIN.USE_PREFETCH = False + +# Normalize the targets (subtract empirical mean, divide by empirical stddev) +__C.TRAIN.BBOX_NORMALIZE_TARGETS = True +# Deprecated (inside weights) +__C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) +# Normalize the targets using "precomputed" (or made up) means and stdevs +# (BBOX_NORMALIZE_TARGETS must also be True) +__C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = False +__C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) +__C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) + +# Train using these proposals +__C.TRAIN.PROPOSAL_METHOD = 'selective_search' + +# Make minibatches from images that have similar aspect ratios (i.e. both +# tall and thin or both short and wide) in order to avoid wasting computation +# on zero-padding. +__C.TRAIN.ASPECT_GROUPING = True + +# Use RPN to detect objects +__C.TRAIN.HAS_RPN = False +# IOU >= thresh: positive example +__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 +# IOU < thresh: negative example +__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 +# If an anchor statisfied by positive and negative conditions set to negative +__C.TRAIN.RPN_CLOBBER_POSITIVES = False +# Max number of foreground examples +__C.TRAIN.RPN_FG_FRACTION = 0.5 +# Total number of examples +__C.TRAIN.RPN_BATCHSIZE = 256 +# NMS threshold used on RPN proposals +__C.TRAIN.RPN_NMS_THRESH = 0.7 +# Number of top scoring boxes to keep before apply NMS to RPN proposals +__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 +# Number of top scoring boxes to keep after applying NMS to RPN proposals +__C.TRAIN.RPN_POST_NMS_TOP_N = 2000 +# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) +__C.TRAIN.RPN_MIN_SIZE = 16 +# Deprecated (outside weights) +__C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) +# Give the positive RPN examples weight of p * 1 / {num positives} +# and give negatives a weight of (1 - p) +# Set to -1.0 to use uniform example weighting +__C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 + + +# +# Testing options +# + +__C.TEST = edict() + +# Scales to use during testing (can list multiple scales) +# Each scale is the pixel size of an image's shortest side +__C.TEST.SCALES = (600,) + +# Max pixel size of the longest side of a scaled input image +__C.TEST.MAX_SIZE = 1000 + +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +__C.TEST.NMS = 0.3 + +# Experimental: treat the (K+1) units in the cls_score layer as linear +# predictors (trained, eg, with one-vs-rest SVMs). 
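+# (when enabled, im_detect reads the raw cls_score blob instead of the
+# softmax probabilities in cls_prob)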
+__C.TEST.SVM = False + +# Test using bounding-box regressors +__C.TEST.BBOX_REG = True + +# Propose boxes +__C.TEST.HAS_RPN = False + +# Test using these proposals +__C.TEST.PROPOSAL_METHOD = 'selective_search' + +## NMS threshold used on RPN proposals +__C.TEST.RPN_NMS_THRESH = 0.7 +## Number of top scoring boxes to keep before apply NMS to RPN proposals +__C.TEST.RPN_PRE_NMS_TOP_N = 6000 +## Number of top scoring boxes to keep after applying NMS to RPN proposals +__C.TEST.RPN_POST_NMS_TOP_N = 300 +# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) +__C.TEST.RPN_MIN_SIZE = 16 + + +# +# MISC +# + +# The mapping from image coordinates to feature map coordinates might cause +# some boxes that are distinct in image space to become identical in feature +# coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor +# for identifying duplicate boxes. +# 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 +__C.DEDUP_BOXES = 1./16. + +# Pixel mean values (BGR order) as a (1, 1, 3) array +# We use the same pixel mean for all networks even though it's not exactly what +# they were trained with +__C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) + +# For reproducibility +__C.RNG_SEED = 3 + +# A small number that's used many times +__C.EPS = 1e-14 + +# Root directory of project +__C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) + +# Data directory +__C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) + +# Model directory +__C.MODELS_DIR = osp.abspath(osp.join(__C.ROOT_DIR, '..', '..', 'models', 'intel_optimized_models', 'faster-rcnn', 'pascal_voc')) + +# Name (or path to) the matlab executable +__C.MATLAB = 'matlab' + +# Place outputs under an experiments directory +__C.EXP_DIR = 'default' + +# Use GPU implementation of non-maximum suppression +__C.USE_GPU_NMS = True + +# Default GPU device id +__C.GPU_ID = 0 + + +def get_output_dir(imdb, net=None): + """Return the directory where experimental artifacts are placed. + If the directory does not exist, it is created. + + A canonical path is built using the name from an imdb and a network + (if not None). + """ + outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) + if net is not None: + outdir = osp.join(outdir, net.name) + if not os.path.exists(outdir): + os.makedirs(outdir) + return outdir + +def _merge_a_into_b(a, b): + """Merge config dictionary a into config dictionary b, clobbering the + options in b whenever they are also specified in a. + """ + if type(a) is not edict: + return + + for k, v in a.iteritems(): + # a must specify keys that are in b + if not b.has_key(k): + raise KeyError('{} is not a valid config key'.format(k)) + + # the types must match, too + old_type = type(b[k]) + if old_type is not type(v): + if isinstance(b[k], np.ndarray): + v = np.array(v, dtype=b[k].dtype) + else: + raise ValueError(('Type mismatch ({} vs. 
{}) ' + 'for config key: {}').format(type(b[k]), + type(v), k)) + + # recursively merge dicts + if type(v) is edict: + try: + _merge_a_into_b(a[k], b[k]) + except: + print('Error under config key: {}'.format(k)) + raise + else: + b[k] = v + +def cfg_from_file(filename): + """Load a config file and merge it into the default options.""" + import yaml + with open(filename, 'r') as f: + yaml_cfg = edict(yaml.load(f)) + + _merge_a_into_b(yaml_cfg, __C) + +def cfg_from_list(cfg_list): + """Set config keys via list (e.g., from command line).""" + from ast import literal_eval + assert len(cfg_list) % 2 == 0 + for k, v in zip(cfg_list[0::2], cfg_list[1::2]): + key_list = k.split('.') + d = __C + for subkey in key_list[:-1]: + assert d.has_key(subkey) + d = d[subkey] + subkey = key_list[-1] + assert d.has_key(subkey) + try: + value = literal_eval(v) + except: + # handle the case when v is a string literal + value = v + assert type(value) == type(d[subkey]), \ + 'type {} does not match original type {}'.format( + type(value), type(d[subkey])) + d[subkey] = value diff --git a/examples/faster-rcnn/lib/fast_rcnn/nms_wrapper.py b/examples/faster-rcnn/lib/fast_rcnn/nms_wrapper.py new file mode 100644 index 000000000..a59485e69 --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/nms_wrapper.py @@ -0,0 +1,20 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +from fast_rcnn.config import cfg +#from nms.gpu_nms import gpu_nms +from nms.cpu_nms import cpu_nms + +def nms(dets, thresh, force_cpu=True): + """Dispatch to either CPU or GPU NMS implementations.""" + + if dets.shape[0] == 0: + return [] + if cfg.USE_GPU_NMS and not force_cpu: + return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) + else: + return cpu_nms(dets, thresh) diff --git a/examples/faster-rcnn/lib/fast_rcnn/test.py b/examples/faster-rcnn/lib/fast_rcnn/test.py new file mode 100644 index 000000000..f889d0977 --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/test.py @@ -0,0 +1,295 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Test a Fast R-CNN network on an imdb (image database).""" + +from fast_rcnn.config import cfg, get_output_dir +from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv +import argparse +from utils.timer import Timer +import numpy as np +import cv2 +import caffe +from fast_rcnn.nms_wrapper import nms +import cPickle +from utils.blob import im_list_to_blob +import os + +def _get_image_blob(im): + """Converts an image into a network input. 
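+    The image is mean-subtracted and rescaled so its shortest side matches
+    each entry of cfg.TEST.SCALES, capping the longest side at
+    cfg.TEST.MAX_SIZE.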
+ + Arguments: + im (ndarray): a color image in BGR order + + Returns: + blob (ndarray): a data blob holding an image pyramid + im_scale_factors (list): list of image scales (relative to im) used + in the image pyramid + """ + im_orig = im.astype(np.float32, copy=True) + im_orig -= cfg.PIXEL_MEANS + + im_shape = im_orig.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + processed_ims = [] + im_scale_factors = [] + + for target_size in cfg.TEST.SCALES: + im_scale = float(target_size) / float(im_size_min) + # Prevent the biggest axis from being more than MAX_SIZE + if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: + im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) + im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, + interpolation=cv2.INTER_LINEAR) + im_scale_factors.append(im_scale) + processed_ims.append(im) + + # Create a blob to hold the input images + blob = im_list_to_blob(processed_ims) + + return blob, np.array(im_scale_factors) + +def _get_rois_blob(im_rois, im_scale_factors): + """Converts RoIs into network inputs. + + Arguments: + im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates + im_scale_factors (list): scale factors as returned by _get_image_blob + + Returns: + blob (ndarray): R x 5 matrix of RoIs in the image pyramid + """ + rois, levels = _project_im_rois(im_rois, im_scale_factors) + rois_blob = np.hstack((levels, rois)) + return rois_blob.astype(np.float32, copy=False) + +def _project_im_rois(im_rois, scales): + """Project image RoIs into the image pyramid built by _get_image_blob. + + Arguments: + im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates + scales (list): scale factors as returned by _get_image_blob + + Returns: + rois (ndarray): R x 4 matrix of projected RoI coordinates + levels (list): image pyramid levels used by each projected RoI + """ + im_rois = im_rois.astype(np.float, copy=False) + + if len(scales) > 1: + widths = im_rois[:, 2] - im_rois[:, 0] + 1 + heights = im_rois[:, 3] - im_rois[:, 1] + 1 + + areas = widths * heights + scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2) + diff_areas = np.abs(scaled_areas - 224 * 224) + levels = diff_areas.argmin(axis=1)[:, np.newaxis] + else: + levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) + + rois = im_rois * scales[levels] + + return rois, levels + +def _get_blobs(im, rois): + """Convert an image and RoIs within that image into network inputs.""" + blobs = {'data' : None, 'rois' : None} + blobs['data'], im_scale_factors = _get_image_blob(im) + if not cfg.TEST.HAS_RPN: + blobs['rois'] = _get_rois_blob(rois, im_scale_factors) + return blobs, im_scale_factors + +def im_detect(net, im, boxes=None): + """Detect object classes in an image given object proposals. + + Arguments: + net (caffe.Net): Fast R-CNN network to use + im (ndarray): color image to test (in BGR order) + boxes (ndarray): R x 4 array of object proposals or None (for RPN) + + Returns: + scores (ndarray): R x K array of object class scores (K includes + background as object category 0) + boxes (ndarray): R x (4*K) array of predicted bounding boxes + """ + blobs, im_scales = _get_blobs(im, boxes) + + # When mapping from image ROIs to feature map ROIs, there's some aliasing + # (some distinct image ROIs get mapped to the same feature ROI). + # Here, we identify duplicate feature ROIs, so we only compute features + # on the unique subset. 
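+    # Duplicates are found by hashing each scaled RoI 5-vector
+    # (level, x1, y1, x2, y2) into a single number via a dot product with
+    # [1, 1e3, 1e6, 1e9, 1e12]; np.unique keeps one representative per hash,
+    # and inv_index is used further down to scatter scores and boxes back to
+    # the original proposal order.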
+ if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: + v = np.array([1, 1e3, 1e6, 1e9, 1e12]) + hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v) + _, index, inv_index = np.unique(hashes, return_index=True, + return_inverse=True) + blobs['rois'] = blobs['rois'][index, :] + boxes = boxes[index, :] + + if cfg.TEST.HAS_RPN: + im_blob = blobs['data'] + blobs['im_info'] = np.array( + [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], + dtype=np.float32) + + # reshape network inputs + net.blobs['data'].reshape(*(blobs['data'].shape)) + if cfg.TEST.HAS_RPN: + net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) + else: + net.blobs['rois'].reshape(*(blobs['rois'].shape)) + + # do forward + forward_kwargs = {'data': blobs['data'].astype(np.float32, copy=False)} + if cfg.TEST.HAS_RPN: + forward_kwargs['im_info'] = blobs['im_info'].astype(np.float32, copy=False) + else: + forward_kwargs['rois'] = blobs['rois'].astype(np.float32, copy=False) + blobs_out = net.forward(**forward_kwargs) + + if cfg.TEST.HAS_RPN: + assert len(im_scales) == 1, "Only single-image batch implemented" + rois = net.blobs['rois'].data.copy() + # unscale back to raw image space + boxes = rois[:, 1:5] / im_scales[0] + + if cfg.TEST.SVM: + # use the raw scores before softmax under the assumption they + # were trained as linear SVMs + scores = net.blobs['cls_score'].data + else: + # use softmax estimated probabilities + scores = blobs_out['cls_prob'] + + if cfg.TEST.BBOX_REG: + # Apply bounding-box regression deltas + box_deltas = blobs_out['bbox_pred'] + pred_boxes = bbox_transform_inv(boxes, box_deltas) + pred_boxes = clip_boxes(pred_boxes, im.shape) + else: + # Simply repeat the boxes, once for each class + pred_boxes = np.tile(boxes, (1, scores.shape[1])) + + if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN: + # Map scores and predictions back to the original set of boxes + scores = scores[inv_index, :] + pred_boxes = pred_boxes[inv_index, :] + + return scores, pred_boxes + +def vis_detections(im, class_name, dets, thresh=0.3): + """Visual debugging of detections.""" + import matplotlib.pyplot as plt + im = im[:, :, (2, 1, 0)] + for i in xrange(np.minimum(10, dets.shape[0])): + bbox = dets[i, :4] + score = dets[i, -1] + if score > thresh: + plt.cla() + plt.imshow(im) + plt.gca().add_patch( + plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], fill=False, + edgecolor='g', linewidth=3) + ) + plt.title('{} {:.3f}'.format(class_name, score)) + plt.show() + +def apply_nms(all_boxes, thresh): + """Apply non-maximum suppression to all predicted boxes output by the + test_net method. 
+ """ + num_classes = len(all_boxes) + num_images = len(all_boxes[0]) + nms_boxes = [[[] for _ in xrange(num_images)] + for _ in xrange(num_classes)] + for cls_ind in xrange(num_classes): + for im_ind in xrange(num_images): + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + # CPU NMS is much faster than GPU NMS when the number of boxes + # is relative small (e.g., < 10k) + # TODO(rbg): autotune NMS dispatch + keep = nms(dets, thresh, force_cpu=True) + if len(keep) == 0: + continue + nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() + return nms_boxes + +def test_net(net, imdb, max_per_image=100, thresh=0.05, vis=False): + """Test a Fast R-CNN network on an image database.""" + num_images = len(imdb.image_index) + # all detections are collected into: + # all_boxes[cls][image] = N x 5 array of detections in + # (x1, y1, x2, y2, score) + all_boxes = [[[] for _ in xrange(num_images)] + for _ in xrange(imdb.num_classes)] + + output_dir = get_output_dir(imdb, net) + + # timers + _t = {'im_detect' : Timer(), 'misc' : Timer()} + + if not cfg.TEST.HAS_RPN: + roidb = imdb.roidb + + for i in xrange(num_images): + # filter out any ground truth boxes + if cfg.TEST.HAS_RPN: + box_proposals = None + else: + # The roidb may contain ground-truth rois (for example, if the roidb + # comes from the training or val split). We only want to evaluate + # detection on the *non*-ground-truth rois. We select those the rois + # that have the gt_classes field set to 0, which means there's no + # ground truth. + box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0] + + im = cv2.imread(imdb.image_path_at(i)) + _t['im_detect'].tic() + scores, boxes = im_detect(net, im, box_proposals) + _t['im_detect'].toc() + + _t['misc'].tic() + # skip j = 0, because it's the background class + for j in xrange(1, imdb.num_classes): + inds = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[inds, j] + cls_boxes = boxes[inds, j*4:(j+1)*4] + cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ + .astype(np.float32, copy=False) + keep = nms(cls_dets, cfg.TEST.NMS) + cls_dets = cls_dets[keep, :] + if vis: + vis_detections(im, imdb.classes[j], cls_dets) + all_boxes[j][i] = cls_dets + + # Limit to max_per_image detections *over all classes* + if max_per_image > 0: + image_scores = np.hstack([all_boxes[j][i][:, -1] + for j in xrange(1, imdb.num_classes)]) + if len(image_scores) > max_per_image: + image_thresh = np.sort(image_scores)[-max_per_image] + for j in xrange(1, imdb.num_classes): + keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] + all_boxes[j][i] = all_boxes[j][i][keep, :] + _t['misc'].toc() + + print 'im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \ + .format(i + 1, num_images, _t['im_detect'].average_time, + _t['misc'].average_time) + + det_file = os.path.join(output_dir, 'detections.pkl') + with open(det_file, 'wb') as f: + cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL) + + print 'Evaluating detections' + imdb.evaluate_detections(all_boxes, output_dir) diff --git a/examples/faster-rcnn/lib/fast_rcnn/train.py b/examples/faster-rcnn/lib/fast_rcnn/train.py new file mode 100644 index 000000000..4a6d362ec --- /dev/null +++ b/examples/faster-rcnn/lib/fast_rcnn/train.py @@ -0,0 +1,163 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Train a Fast R-CNN network.""" + +import caffe 
+from fast_rcnn.config import cfg
+import roi_data_layer.roidb as rdl_roidb
+from utils.timer import Timer
+import numpy as np
+import os
+
+from caffe.proto import caffe_pb2
+import google.protobuf as pb2
+import google.protobuf.text_format
+
+class SolverWrapper(object):
+    """A simple wrapper around Caffe's solver.
+    This wrapper gives us control over the snapshotting process, which we
+    use to unnormalize the learned bounding-box regression weights.
+    """
+
+    def __init__(self, solver_prototxt, roidb, output_dir,
+                 pretrained_model=None):
+        """Initialize the SolverWrapper."""
+        self.output_dir = output_dir
+
+        if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
+            cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
+            # RPN can only use precomputed normalization because there are no
+            # fixed statistics to compute a priori
+            assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED
+
+        if cfg.TRAIN.BBOX_REG:
+            print 'Computing bounding-box regression targets...'
+            self.bbox_means, self.bbox_stds = \
+                    rdl_roidb.add_bbox_regression_targets(roidb)
+            print 'done'
+
+        self.solver = caffe.SGDSolver(solver_prototxt)
+        if pretrained_model is not None:
+            print ('Loading pretrained model '
+                   'weights from {:s}').format(pretrained_model)
+            self.solver.net.copy_from(pretrained_model)
+
+        self.solver_param = caffe_pb2.SolverParameter()
+        with open(solver_prototxt, 'rt') as f:
+            pb2.text_format.Merge(f.read(), self.solver_param)
+
+        self.solver.net.layers[0].set_roidb(roidb)
+
+    def snapshot(self):
+        """Take a snapshot of the network after unnormalizing the learned
+        bounding-box regression weights. This enables easy use at test-time.
+        """
+        net = self.solver.net
+
+        scale_bbox_params = (cfg.TRAIN.BBOX_REG and
+                             cfg.TRAIN.BBOX_NORMALIZE_TARGETS and
+                             net.params.has_key('bbox_pred'))
+
+        if scale_bbox_params:
+            # save original values
+            orig_0 = net.params['bbox_pred'][0].data.copy()
+            orig_1 = net.params['bbox_pred'][1].data.copy()
+
+            # scale and shift with bbox reg unnormalization; then save snapshot
+            net.params['bbox_pred'][0].data[...] = \
+                    (net.params['bbox_pred'][0].data *
+                     self.bbox_stds[:, np.newaxis])
+            net.params['bbox_pred'][1].data[...] = \
+                    (net.params['bbox_pred'][1].data *
+                     self.bbox_stds + self.bbox_means)
+
+        infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
+                 if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
+        filename = (self.solver_param.snapshot_prefix + infix +
+                    '_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
+        filename = os.path.join(self.output_dir, filename)
+
+        net.save(str(filename))
+        print 'Wrote snapshot to: {:s}'.format(filename)
+
+        if scale_bbox_params:
+            # restore net to original state
+            net.params['bbox_pred'][0].data[...] = orig_0
+            net.params['bbox_pred'][1].data[...] = orig_1
+        return filename
+
+    def train_model(self, max_iters):
+        """Network training loop."""
+        last_snapshot_iter = -1
+        timer = Timer()
+        model_paths = []
+        while self.solver.iter < max_iters:
+            # Make one SGD update
+            timer.tic()
+            self.solver.step(1)
+            timer.toc()
+            if self.solver.iter % (10 * self.solver_param.display) == 0:
+                print 'speed: {:.3f}s / iter'.format(timer.average_time)
+
+            if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
+                last_snapshot_iter = self.solver.iter
+                model_paths.append(self.snapshot())
+
+        if last_snapshot_iter != self.solver.iter:
+            model_paths.append(self.snapshot())
+        return model_paths
+
+def get_training_roidb(imdb):
+    """Returns a roidb (Region of Interest database) for use in training."""
+    if cfg.TRAIN.USE_FLIPPED:
+        print 'Appending horizontally-flipped training examples...'
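+        # append_flipped_images() doubles the dataset: each roidb entry gets
+        # a mirrored twin whose boxes are reflected about the vertical axis.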
+        imdb.append_flipped_images()
+        print 'done'
+
+    print 'Preparing training data...'
+    rdl_roidb.prepare_roidb(imdb)
+    print 'done'
+
+    return imdb.roidb
+
+def filter_roidb(roidb):
+    """Remove roidb entries that have no usable RoIs."""
+
+    def is_valid(entry):
+        # Valid images have:
+        #   (1) At least one foreground RoI OR
+        #   (2) At least one background RoI
+        overlaps = entry['max_overlaps']
+        # find boxes with sufficient overlap
+        fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
+        # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
+        bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
+                           (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
+        # image is only valid if such boxes exist
+        valid = len(fg_inds) > 0 or len(bg_inds) > 0
+        return valid
+
+    num = len(roidb)
+    filtered_roidb = [entry for entry in roidb if is_valid(entry)]
+    num_after = len(filtered_roidb)
+    print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
+                                                       num, num_after)
+    return filtered_roidb
+
+def train_net(solver_prototxt, roidb, output_dir,
+              pretrained_model=None, max_iters=40000):
+    """Train a Fast R-CNN network."""
+
+    roidb = filter_roidb(roidb)
+    sw = SolverWrapper(solver_prototxt, roidb, output_dir,
+                       pretrained_model=pretrained_model)
+
+    print 'Solving...'
+    model_paths = sw.train_model(max_iters)
+    print 'done solving'
+    return model_paths
diff --git a/examples/faster-rcnn/lib/nms/__init__.py b/examples/faster-rcnn/lib/nms/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/faster-rcnn/lib/nms/cpu_nms.pyx b/examples/faster-rcnn/lib/nms/cpu_nms.pyx
new file mode 100644
index 000000000..625c42df3
--- /dev/null
+++ b/examples/faster-rcnn/lib/nms/cpu_nms.pyx
@@ -0,0 +1,91 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+cimport cython
+cimport openmp
+
+from cython.parallel cimport prange
+from cython.parallel cimport parallel
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b) nogil:
+    return a if a >= b else b
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b) nogil:
+    return a if a <= b else b
+
+cdef inline np.int_t thresholding(np.float32_t ovr, np.float32_t thresh) nogil:
+    return 1 if ovr >= thresh else 0
+
+import os
+from multiprocessing import cpu_count
+cdef int set_num = 0
+try:
+    # OMP_NUM_THREADS is a string in the environment; cast it before
+    # assigning to the cdef int (the bare except also covers bad values)
+    set_num = int(os.environ["OMP_NUM_THREADS"])
+except:
+    set_num = cpu_count() / 2
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
+
+    cdef int ndets = dets.shape[0]
+    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
+        np.zeros((ndets), dtype=np.int)
+
+    # nominal indices
+    cdef int _i, _j
+    # sorted indices
+    cdef int i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+    cdef np.float32_t threshc = thresh
+
+    global set_num
+
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        thread_num = set_num if (ndets - _i - 1) > set_num else (ndets - _i - 1)
+        if thread_num == 0:
+            continue
+        with nogil:
+            for _j in prange(_i + 1, ndets, schedule = 'dynamic', num_threads = thread_num):
+                j = order[_j]
+                if suppressed[j] == 0:
+                    xx1 = max(ix1, x1[j])
+                    yy1 = max(iy1, y1[j])
+                    xx2 = min(ix2, x2[j])
+                    yy2 = min(iy2, y2[j])
+                    w = max(0.0, xx2 - xx1 + 1)
+                    h = max(0.0, yy2 - yy1 + 1)
+                    inter = w * h
+                    ovr = inter / (iarea + areas[j] - inter)
+                    suppressed[j] = thresholding(ovr, threshc)
+
+    return keep
diff --git a/examples/faster-rcnn/lib/nms/gpu_nms.hpp b/examples/faster-rcnn/lib/nms/gpu_nms.hpp
new file mode 100644
index 000000000..68b6d42cd
--- /dev/null
+++ b/examples/faster-rcnn/lib/nms/gpu_nms.hpp
@@ -0,0 +1,2 @@
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id);
diff --git a/examples/faster-rcnn/lib/nms/gpu_nms.pyx b/examples/faster-rcnn/lib/nms/gpu_nms.pyx
new file mode 100644
index 000000000..59d84afe9
--- /dev/null
+++ b/examples/faster-rcnn/lib/nms/gpu_nms.pyx
@@ -0,0 +1,31 @@
+# --------------------------------------------------------
+# Faster R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+assert sizeof(int) == sizeof(np.int32_t)
+
+cdef extern from "gpu_nms.hpp":
+    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+
+def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
+            np.int32_t device_id=0):
+    cdef int boxes_num = dets.shape[0]
+    cdef int boxes_dim = dets.shape[1]
+    cdef int num_out
+    cdef np.ndarray[np.int32_t, ndim=1] \
+        keep = np.zeros(boxes_num, dtype=np.int32)
+    cdef np.ndarray[np.float32_t, ndim=1] \
+        scores = dets[:, 4]
+    cdef np.ndarray[np.int_t, ndim=1] \
+        order = scores.argsort()[::-1]
+    cdef np.ndarray[np.float32_t, ndim=2] \
+        sorted_dets = dets[order, :]
+    _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+    keep = keep[:num_out]
+    return list(order[keep])
diff --git a/examples/faster-rcnn/lib/nms/nms_kernel.cu b/examples/faster-rcnn/lib/nms/nms_kernel.cu
new file mode 100644
index 000000000..038a59012
--- /dev/null
+++ b/examples/faster-rcnn/lib/nms/nms_kernel.cu
@@ -0,0 +1,144 @@
+// ------------------------------------------------------------------
+// Faster R-CNN
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
+// Written by Shaoqing Ren
+// ------------------------------------------------------------------
+
+#include "gpu_nms.hpp"
+#include <vector>
+#include <iostream>
+
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    if (error != cudaSuccess) { \
+      std::cout << cudaGetErrorString(error) << std::endl; \
+    } \
+  } while (0)
+
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+__device__ inline float devIoU(float const * const a, float const * const b) {
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float *dev_boxes, unsigned long long *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  __shared__ float block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+void _set_device(int device_id) {
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device == device_id) {
+    return;
+  }
+  // The call to cudaSetDevice must come before any calls to Get, which
+  // may perform initialization using the GPU.
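+  // (The early return above means the device is only switched when the
+  // requested one is not already current.)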
+  CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id) {
+  _set_device(device_id);
+
+  float* boxes_dev = NULL;
+  unsigned long long* mask_dev = NULL;
+
+  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+
+  CUDA_CHECK(cudaMalloc(&boxes_dev,
+                        boxes_num * boxes_dim * sizeof(float)));
+  CUDA_CHECK(cudaMemcpy(boxes_dev,
+                        boxes_host,
+                        boxes_num * boxes_dim * sizeof(float),
+                        cudaMemcpyHostToDevice));
+
+  CUDA_CHECK(cudaMalloc(&mask_dev,
+                        boxes_num * col_blocks * sizeof(unsigned long long)));
+
+  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
+              DIVUP(boxes_num, threadsPerBlock));
+  dim3 threads(threadsPerBlock);
+  nms_kernel<<<blocks, threads>>>(boxes_num,
+                                  nms_overlap_thresh,
+                                  boxes_dev,
+                                  mask_dev);
+
+  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
+  CUDA_CHECK(cudaMemcpy(&mask_host[0],
+                        mask_dev,
+                        sizeof(unsigned long long) * boxes_num * col_blocks,
+                        cudaMemcpyDeviceToHost));
+
+  std::vector<unsigned long long> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long *p = &mask_host[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+  *num_out = num_to_keep;
+
+  CUDA_CHECK(cudaFree(boxes_dev));
+  CUDA_CHECK(cudaFree(mask_dev));
+}
diff --git a/examples/faster-rcnn/lib/nms/py_cpu_nms.py b/examples/faster-rcnn/lib/nms/py_cpu_nms.py
new file mode 100644
index 000000000..54e7b25fe
--- /dev/null
+++ b/examples/faster-rcnn/lib/nms/py_cpu_nms.py
@@ -0,0 +1,38 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+
+def py_cpu_nms(dets, thresh):
+    """Pure Python NMS baseline."""
+    x1 = dets[:, 0]
+    y1 = dets[:, 1]
+    x2 = dets[:, 2]
+    y2 = dets[:, 3]
+    scores = dets[:, 4]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= thresh)[0]
+        order = order[inds + 1]
+
+    return keep
diff --git a/examples/faster-rcnn/lib/pycocotools/UPSTREAM_REV b/examples/faster-rcnn/lib/pycocotools/UPSTREAM_REV
new file mode 100644
index 000000000..706219b77
--- /dev/null
+++ b/examples/faster-rcnn/lib/pycocotools/UPSTREAM_REV
@@ -0,0 +1 @@
+https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574
diff --git a/examples/faster-rcnn/lib/pycocotools/__init__.py b/examples/faster-rcnn/lib/pycocotools/__init__.py
new file mode 100644
index 000000000..3f7d85bba
--- /dev/null
+++ b/examples/faster-rcnn/lib/pycocotools/__init__.py
@@ -0,0 +1 @@
+__author__ = 'tylin'
diff --git a/examples/faster-rcnn/lib/pycocotools/_mask.c b/examples/faster-rcnn/lib/pycocotools/_mask.c
new file mode 100644
index 000000000..e47a984fc
--- /dev/null
+++ 
b/examples/faster-rcnn/lib/pycocotools/_mask.c @@ -0,0 +1,15550 @@ +/* Generated by Cython 0.25.2 */ + +#define PY_SSIZE_T_CLEAN +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03020000) + #error Cython requires Python 2.6+ or Python 3.2+. +#else +#define CYTHON_ABI "0_25_2" +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x03030000 || (PY_MAJOR_VERSION == 2 && PY_VERSION_HEX >= 0x02070000) + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 +#elif defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif 
!defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #ifndef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 1 + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #include "longintrepr.h" + #undef SHIFT + #undef BASE + #undef MASK +#endif +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject **args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? 
PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? 
PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t PyInt_AsLong +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? 
PyMethod_New(func, self) : PyInstanceMethod_New(func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) + +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #elif defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif + +#if defined(WIN32) || defined(MS_WINDOWS) + #define _USE_MATH_DEFINES +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + + +#define __PYX_ERR(f_index, lineno, Ln_error) \ +{ \ + __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \ +} + +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__pycocotools___mask +#define __PYX_HAVE_API__pycocotools___mask +#include +#include +#include +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" +#include "maskApi.h" +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#ifdef PYREX_WITHOUT_ASSERTIONS +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const 
Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0 +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) && defined (_M_X64) + #define __Pyx_sst_abs(value) _abs64(value) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? -value : value) +#endif +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyObject_AsSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define __Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +#if PY_MAJOR_VERSION < 3 +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) +{ + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#else +#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen +#endif +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +#define 
__Pyx_PyBool_FromLong(b) ((b) ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False)) +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* __PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c)); + if (!__PYX_DEFAULT_STRING_ENCODING) goto bad; + strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c); + 
Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + return -1; +} +#endif +#endif + + +/* Test for GCC > 2.95 */ +#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))) + #define likely(x) __builtin_expect(!!(x), 1) + #define unlikely(x) __builtin_expect(!!(x), 0) +#else /* !__GNUC__ or GCC < 2.95 */ + #define likely(x) (x) + #define unlikely(x) (x) +#endif /* __GNUC__ */ + +static PyObject *__pyx_m; +static PyObject *__pyx_d; +static PyObject *__pyx_b; +static PyObject *__pyx_empty_tuple; +static PyObject *__pyx_empty_bytes; +static PyObject *__pyx_empty_unicode; +static int __pyx_lineno; +static int __pyx_clineno = 0; +static const char * __pyx_cfilenm= __FILE__; +static const char *__pyx_filename; + +/* Header.proto */ +#if !defined(CYTHON_CCOMPLEX) + #if defined(__cplusplus) + #define CYTHON_CCOMPLEX 1 + #elif defined(_Complex_I) + #define CYTHON_CCOMPLEX 1 + #else + #define CYTHON_CCOMPLEX 0 + #endif +#endif +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #include + #else + #include + #endif +#endif +#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__) + #undef _Complex_I + #define _Complex_I 1.0fj +#endif + + +static const char *__pyx_f[] = { + "pycocotools/_mask.pyx", + "__init__.pxd", + "type.pxd", +}; +/* BufferFormatStructs.proto */ +#define IS_UNSIGNED(type) (((type) -1) > 0) +struct __Pyx_StructField_; +#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0) +typedef struct { + const char* name; + struct __Pyx_StructField_* fields; + size_t size; + size_t arraysize[8]; + int ndim; + char typegroup; + char is_unsigned; + int flags; +} __Pyx_TypeInfo; +typedef struct __Pyx_StructField_ { + __Pyx_TypeInfo* type; + const char* name; + size_t offset; +} __Pyx_StructField; +typedef struct { + __Pyx_StructField* field; + size_t parent_offset; +} __Pyx_BufFmt_StackElem; +typedef struct { + __Pyx_StructField root; + __Pyx_BufFmt_StackElem* head; + size_t fmt_offset; + size_t new_count, enc_count; + size_t struct_alignment; + int is_complex; + char enc_type; + char new_packmode; + char enc_packmode; + char is_valid_array; +} __Pyx_BufFmt_Context; + + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":725 + * # in Cython to enable them only on the right systems. 
+ * + * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + */ +typedef npy_int8 __pyx_t_5numpy_int8_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":726 + * + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t + */ +typedef npy_int16 __pyx_t_5numpy_int16_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":727 + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< + * ctypedef npy_int64 int64_t + * #ctypedef npy_int96 int96_t + */ +typedef npy_int32 __pyx_t_5numpy_int32_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":728 + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< + * #ctypedef npy_int96 int96_t + * #ctypedef npy_int128 int128_t + */ +typedef npy_int64 __pyx_t_5numpy_int64_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":732 + * #ctypedef npy_int128 int128_t + * + * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + */ +typedef npy_uint8 __pyx_t_5numpy_uint8_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":733 + * + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t + */ +typedef npy_uint16 __pyx_t_5numpy_uint16_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":734 + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<< + * ctypedef npy_uint64 uint64_t + * #ctypedef npy_uint96 uint96_t + */ +typedef npy_uint32 __pyx_t_5numpy_uint32_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":735 + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< + * #ctypedef npy_uint96 uint96_t + * #ctypedef npy_uint128 uint128_t + */ +typedef npy_uint64 __pyx_t_5numpy_uint64_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":739 + * #ctypedef npy_uint128 uint128_t + * + * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< + * ctypedef npy_float64 float64_t + * #ctypedef npy_float80 float80_t + */ +typedef npy_float32 __pyx_t_5numpy_float32_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":740 + * + * ctypedef npy_float32 float32_t + * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< + * #ctypedef npy_float80 float80_t + * #ctypedef npy_float128 float128_t + */ +typedef npy_float64 __pyx_t_5numpy_float64_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":749 + * # The int types are mapped a bit surprising -- + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t + */ +typedef npy_long __pyx_t_5numpy_int_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":750 + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong longlong_t + * + */ +typedef 
npy_longlong __pyx_t_5numpy_long_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":751 + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_ulong uint_t + */ +typedef npy_longlong __pyx_t_5numpy_longlong_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":753 + * ctypedef npy_longlong longlong_t + * + * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t + */ +typedef npy_ulong __pyx_t_5numpy_uint_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":754 + * + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulonglong_t + * + */ +typedef npy_ulonglong __pyx_t_5numpy_ulong_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":755 + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_intp intp_t + */ +typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":757 + * ctypedef npy_ulonglong ulonglong_t + * + * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< + * ctypedef npy_uintp uintp_t + * + */ +typedef npy_intp __pyx_t_5numpy_intp_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758 + * + * ctypedef npy_intp intp_t + * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< + * + * ctypedef npy_double float_t + */ +typedef npy_uintp __pyx_t_5numpy_uintp_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":760 + * ctypedef npy_uintp uintp_t + * + * ctypedef npy_double float_t # <<<<<<<<<<<<<< + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t + */ +typedef npy_double __pyx_t_5numpy_float_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":761 + * + * ctypedef npy_double float_t + * ctypedef npy_double double_t # <<<<<<<<<<<<<< + * ctypedef npy_longdouble longdouble_t + * + */ +typedef npy_double __pyx_t_5numpy_double_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":762 + * ctypedef npy_double float_t + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cfloat cfloat_t + */ +typedef npy_longdouble __pyx_t_5numpy_longdouble_t; +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< float > __pyx_t_float_complex; + #else + typedef float _Complex __pyx_t_float_complex; + #endif +#else + typedef struct { float real, imag; } __pyx_t_float_complex; +#endif +static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float); + +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< double > __pyx_t_double_complex; + #else + typedef double _Complex __pyx_t_double_complex; + #endif +#else + typedef struct { double real, imag; } __pyx_t_double_complex; +#endif +static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double); + + +/*--- Type declarations ---*/ +struct __pyx_obj_11pycocotools_5_mask_RLEs; +struct __pyx_obj_11pycocotools_5_mask_Masks; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":764 + * 
ctypedef npy_longdouble longdouble_t + * + * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t + */ +typedef npy_cfloat __pyx_t_5numpy_cfloat_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":765 + * + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< + * ctypedef npy_clongdouble clongdouble_t + * + */ +typedef npy_cdouble __pyx_t_5numpy_cdouble_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":766 + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cdouble complex_t + */ +typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768 + * ctypedef npy_clongdouble clongdouble_t + * + * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew1(a): + */ +typedef npy_cdouble __pyx_t_5numpy_complex_t; + +/* "pycocotools/_mask.pyx":53 + * # python class to wrap RLE array in C + * # the class handles the memory allocation and deallocation + * cdef class RLEs: # <<<<<<<<<<<<<< + * cdef RLE *_R + * cdef siz _n + */ +struct __pyx_obj_11pycocotools_5_mask_RLEs { + PyObject_HEAD + RLE *_R; + siz _n; +}; + + +/* "pycocotools/_mask.pyx":74 + * # python class to wrap Mask array in C + * # the class handles the memory allocation and deallocation + * cdef class Masks: # <<<<<<<<<<<<<< + * cdef byte *_mask + * cdef siz _h + */ +struct __pyx_obj_11pycocotools_5_mask_Masks { + PyObject_HEAD + byte *_mask; + siz _h; + siz _w; + siz _n; +}; + + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define 
__Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define __Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* IncludeStringH.proto */ +#include + +/* BytesEquals.proto */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); + +/* UnicodeEquals.proto */ +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals); + +/* StrEquals.proto */ +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyString_Equals __Pyx_PyUnicode_Equals +#else +#define __Pyx_PyString_Equals __Pyx_PyBytes_Equals +#endif + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = PyThreadState_GET(); +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) 
__Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* RaiseException.proto */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* ExtTypeTest.proto */ +static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); + +/* ArgTypeTest.proto */ +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact); + +/* ListAppend.proto */ +#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS +static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_PyList_Append(L,x) PyList_Append(L,x) +#endif + +/* PyIntBinop.proto */ +#if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, long intval, int inplace); +#else +#define __Pyx_PyInt_AddObjC(op1, op2, intval, inplace)\ + (inplace ? PyNumber_InPlaceAdd(op1, op2) : PyNumber_Add(op1, op2)) +#endif + +/* GetItemInt.proto */ +#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\ + (is_list ? 
(PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\ + __Pyx_GetItemInt_Generic(o, to_py_func(i)))) +#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\ + (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\ + __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\ + (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL)) +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + int wraparound, int boundscheck); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j); +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, + int is_list, int wraparound, int boundscheck); + +/* BufferFormatCheck.proto */ +static CYTHON_INLINE int __Pyx_GetBufferAndValidate(Py_buffer* buf, PyObject* obj, + __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); +static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info); +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts); +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type); // PROTO + +/* GetModuleGlobalName.proto */ +static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); + +/* PyCFunctionFastCall.proto */ +#if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs); +#else +#define __Pyx_PyCFunction_FastCall(func, args, nargs) (assert(0), NULL) +#endif + +/* PyFunctionFastCall.proto */ +#if CYTHON_FAST_PYCALL +#define __Pyx_PyFunction_FastCall(func, args, nargs)\ + __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL) +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs); +#else +#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs) +#endif +#endif + +/* PyObjectCallMethO.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg); +#endif + +/* PyObjectCallOneArg.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg); + +/* PyIntBinop.proto */ +#if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, long intval, int inplace); +#else +#define __Pyx_PyInt_EqObjC(op1, op2, intval, inplace)\ + PyObject_RichCompare(op1, op2, Py_EQ) + #endif + +/* ListCompAppend.proto */ +#if CYTHON_USE_PYLIST_INTERNALS && CYTHON_ASSUME_SAFE_MACROS +static CYTHON_INLINE int __Pyx_ListComp_Append(PyObject* list, PyObject* x) { + PyListObject* L = (PyListObject*) list; + Py_ssize_t len = Py_SIZE(list); + if (likely(L->allocated > len)) { + Py_INCREF(x); + PyList_SET_ITEM(list, len, x); + Py_SIZE(list) = len+1; + return 0; + } + return PyList_Append(list, x); +} +#else +#define __Pyx_ListComp_Append(L,x) PyList_Append(L,x) 
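+/* When CYTHON_USE_PYLIST_INTERNALS is available, the two append helpers
+ * above (__Pyx_PyList_Append and __Pyx_ListComp_Append) bypass the
+ * PyList_Append call whenever the list still has spare capacity
+ * (L->allocated > len): they store the item with PyList_SET_ITEM and bump
+ * Py_SIZE directly. __Pyx_PyList_Append additionally requires the list to
+ * be more than half full (len > L->allocated >> 1) before taking the
+ * inline path. A minimal Python-level sketch of the pattern this speeds
+ * up (names illustrative only):
+ *
+ *     objs = []
+ *     for i in range(n):
+ *         objs.append(make(i))   # usually hits the preallocated buffer
+ */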
+#endif + +/* FetchCommonType.proto */ +static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type); + +/* CythonFunction.proto */ +#define __Pyx_CyFunction_USED 1 +#include <structmember.h> +#define __Pyx_CYFUNCTION_STATICMETHOD 0x01 +#define __Pyx_CYFUNCTION_CLASSMETHOD 0x02 +#define __Pyx_CYFUNCTION_CCLASS 0x04 +#define __Pyx_CyFunction_GetClosure(f)\ + (((__pyx_CyFunctionObject *) (f))->func_closure) +#define __Pyx_CyFunction_GetClassObj(f)\ + (((__pyx_CyFunctionObject *) (f))->func_classobj) +#define __Pyx_CyFunction_Defaults(type, f)\ + ((type *)(((__pyx_CyFunctionObject *) (f))->defaults)) +#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\ + ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g) +typedef struct { + PyCFunctionObject func; +#if PY_VERSION_HEX < 0x030500A0 + PyObject *func_weakreflist; +#endif + PyObject *func_dict; + PyObject *func_name; + PyObject *func_qualname; + PyObject *func_doc; + PyObject *func_globals; + PyObject *func_code; + PyObject *func_closure; + PyObject *func_classobj; + void *defaults; + int defaults_pyobjects; + int flags; + PyObject *defaults_tuple; + PyObject *defaults_kwdict; + PyObject *(*defaults_getter)(PyObject *); + PyObject *func_annotations; +} __pyx_CyFunctionObject; +static PyTypeObject *__pyx_CyFunctionType = 0; +#define __Pyx_CyFunction_NewEx(ml, flags, qualname, self, module, globals, code)\ + __Pyx_CyFunction_New(__pyx_CyFunctionType, ml, flags, qualname, self, module, globals, code) +static PyObject *__Pyx_CyFunction_New(PyTypeObject *, PyMethodDef *ml, + int flags, PyObject* qualname, + PyObject *self, + PyObject *module, PyObject *globals, + PyObject* code); +static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m, + size_t size, + int pyobjects); +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m, + PyObject *tuple); +static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m, + PyObject *dict); +static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m, + PyObject *dict); +static int __pyx_CyFunction_init(void); + +/* BufferFallbackError.proto */ +static void __Pyx_RaiseBufferFallbackError(void); + +/* None.proto */ +static CYTHON_INLINE Py_ssize_t __Pyx_div_Py_ssize_t(Py_ssize_t, Py_ssize_t); + +/* BufferIndexError.proto */ +static void __Pyx_RaiseBufferIndexError(int axis); + +#define __Pyx_BufPtrStrided1d(type, buf, i0, s0) (type)((char*)buf + i0 * s0) +/* DictGetItem.proto */ +#if PY_MAJOR_VERSION >= 3 && !CYTHON_COMPILING_IN_PYPY +static PyObject *__Pyx_PyDict_GetItem(PyObject *d, PyObject* key) { + PyObject *value; + value = PyDict_GetItemWithError(d, key); + if (unlikely(!value)) { + if (!PyErr_Occurred()) { + PyObject* args = PyTuple_Pack(1, key); + if (likely(args)) + PyErr_SetObject(PyExc_KeyError, args); + Py_XDECREF(args); + } + return NULL; + } + Py_INCREF(value); + return value; +} +#else + #define __Pyx_PyDict_GetItem(d, key) PyObject_GetItem(d, key) +#endif + +/* RaiseTooManyValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected); + +/* RaiseNeedMoreValuesToUnpack.proto */ +static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index); + +/* RaiseNoneIterError.proto */ +static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void); + +/* SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, 
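+/* A note on the fast path here: with CYTHON_FAST_THREAD_STATE,
+ * __Pyx_ExceptionSave and __Pyx_ExceptionReset read and write the cached
+ * PyThreadState's exception slots directly; otherwise they fall back to
+ * the public PyErr_GetExcInfo/PyErr_SetExcInfo calls visible in the #else
+ * branch just below. Same behaviour, fewer thread-state lookups.
+ */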
PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* BufferStructDeclare.proto */ +typedef struct { + Py_ssize_t shape, strides, suboffsets; +} __Pyx_Buf_DimInfo; +typedef struct { + size_t refcount; + Py_buffer pybuffer; +} __Pyx_Buffer; +typedef struct { + __Pyx_Buffer *rcbuffer; + char *data; + __Pyx_Buf_DimInfo diminfo[8]; +} __Pyx_LocalBuf_ND; + +#if PY_MAJOR_VERSION < 3 + static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags); + static void __Pyx_ReleaseBuffer(Py_buffer *view); +#else + #define __Pyx_GetBuffer PyObject_GetBuffer + #define __Pyx_ReleaseBuffer PyBuffer_Release +#endif + + +/* None.proto */ +static Py_ssize_t __Pyx_zeros[] = {0, 0, 0, 0, 0, 0, 0, 0}; +static Py_ssize_t __Pyx_minusones[] = {-1, -1, -1, -1, -1, -1, -1, -1}; + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_siz(siz value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_Py_intptr_t(Py_intptr_t value); + +/* RealImag.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #define __Pyx_CREAL(z) ((z).real()) + #define __Pyx_CIMAG(z) ((z).imag()) + #else + #define __Pyx_CREAL(z) (__real__(z)) + #define __Pyx_CIMAG(z) (__imag__(z)) + #endif +#else + #define __Pyx_CREAL(z) ((z).real) + #define __Pyx_CIMAG(z) ((z).imag) +#endif +#if defined(__cplusplus) && CYTHON_CCOMPLEX\ + && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103) + #define __Pyx_SET_CREAL(z,x) ((z).real(x)) + #define __Pyx_SET_CIMAG(z,y) ((z).imag(y)) +#else + #define __Pyx_SET_CREAL(z,x) 
__Pyx_CREAL(z) = (x) + #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y) +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_float(a, b) ((a)==(b)) + #define __Pyx_c_sum_float(a, b) ((a)+(b)) + #define __Pyx_c_diff_float(a, b) ((a)-(b)) + #define __Pyx_c_prod_float(a, b) ((a)*(b)) + #define __Pyx_c_quot_float(a, b) ((a)/(b)) + #define __Pyx_c_neg_float(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_float(z) ((z)==(float)0) + #define __Pyx_c_conj_float(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_float(z) (::std::abs(z)) + #define __Pyx_c_pow_float(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_float(z) ((z)==0) + #define __Pyx_c_conj_float(z) (conjf(z)) + #if 1 + #define __Pyx_c_abs_float(z) (cabsf(z)) + #define __Pyx_c_pow_float(a, b) (cpowf(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex); + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex); + #endif +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_double(a, b) ((a)==(b)) + #define __Pyx_c_sum_double(a, b) ((a)+(b)) + #define __Pyx_c_diff_double(a, b) ((a)-(b)) + #define __Pyx_c_prod_double(a, b) ((a)*(b)) + #define __Pyx_c_quot_double(a, b) ((a)/(b)) + #define __Pyx_c_neg_double(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_double(z) ((z)==(double)0) + #define __Pyx_c_conj_double(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (::std::abs(z)) + #define __Pyx_c_pow_double(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_double(z) ((z)==0) + #define __Pyx_c_conj_double(z) (conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (cabs(z)) + #define __Pyx_c_pow_double(a, b) (cpow(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex); + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex 
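+/* The float and double Arithmetic.proto blocks here have the same shape:
+ * with CYTHON_CCOMPLEX the operators map straight onto the native complex
+ * type (std::conj/std::abs/std::pow under C++, conjf/cabsf/cpowf and their
+ * double counterparts under C99), and only when no native complex type is
+ * available do the software fallback routines declared here get compiled.
+ */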
__Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex); + #endif +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_enum__NPY_TYPES(enum NPY_TYPES value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE siz __Pyx_PyInt_As_siz(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* PyIdentifierFromString.proto */ +#if !defined(__Pyx_PyIdentifier_FromString) +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s) +#else + #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s) +#endif +#endif + +/* ModuleImport.proto */ +static PyObject *__Pyx_ImportModule(const char *name); + +/* TypeImport.proto */ +static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'cpython.buffer' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdio' */ + +/* Module declarations from '__builtin__' */ + +/* Module declarations from 'cpython.type' */ +static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0; + +/* Module declarations from 'cpython' */ + +/* Module declarations from 'cpython.object' */ + +/* Module declarations from 'cpython.ref' */ + +/* Module declarations from 'libc.stdlib' */ + +/* Module declarations from 'numpy' */ + +/* Module declarations from 'numpy' */ +static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; +static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0; +static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0; +static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; +static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/ +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void); /*proto*/ + +/* Module declarations from 'pycocotools._mask' */ +static PyTypeObject *__pyx_ptype_11pycocotools_5_mask_RLEs = 0; +static PyTypeObject *__pyx_ptype_11pycocotools_5_mask_Masks = 0; +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t = { "uint8_t", NULL, sizeof(__pyx_t_5numpy_uint8_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_uint8_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_uint8_t), 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_double_t = { "double_t", NULL, sizeof(__pyx_t_5numpy_double_t), { 0 }, 0, 'R', 0, 0 }; +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t = { "uint32_t", NULL, sizeof(__pyx_t_5numpy_uint32_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_uint32_t) ? 
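+/* The single-character codes in these __Pyx_TypeInfo initializers ('U'
+ * unsigned integer, 'I' signed integer, 'R' real floating point) are what
+ * __Pyx_GetBufferAndValidate matches against an exporter's Py_buffer
+ * format string, so the uint8/double/uint32 ndarray arguments taken by
+ * the mask functions below are type-checked at runtime before use.
+ */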
'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_uint32_t), 0 }; +#define __Pyx_MODULE_NAME "pycocotools._mask" +int __pyx_module_is_main_pycocotools___mask = 0; + +/* Implementation of 'pycocotools._mask' */ +static PyObject *__pyx_builtin_range; +static PyObject *__pyx_builtin_AttributeError; +static PyObject *__pyx_builtin_enumerate; +static PyObject *__pyx_builtin_ValueError; +static PyObject *__pyx_builtin_RuntimeError; +static PyObject *__pyx_builtin_ImportError; +static const char __pyx_k_F[] = "F"; +static const char __pyx_k_N[] = "N"; +static const char __pyx_k_R[] = "R"; +static const char __pyx_k_a[] = "_a"; +static const char __pyx_k_h[] = "h"; +static const char __pyx_k_i[] = "i"; +static const char __pyx_k_j[] = "j"; +static const char __pyx_k_m[] = "m"; +static const char __pyx_k_n[] = "n"; +static const char __pyx_k_p[] = "p"; +static const char __pyx_k_w[] = "w"; +static const char __pyx_k_Rs[] = "Rs"; +static const char __pyx_k_bb[] = "bb"; +static const char __pyx_k_dt[] = "dt"; +static const char __pyx_k_gt[] = "gt"; +static const char __pyx_k_np[] = "np"; +static const char __pyx_k_a_2[] = "a"; +static const char __pyx_k_all[] = "all"; +static const char __pyx_k_iou[] = "_iou"; +static const char __pyx_k_len[] = "_len"; +static const char __pyx_k_obj[] = "obj"; +static const char __pyx_k_area[] = "area"; +static const char __pyx_k_bb_2[] = "_bb"; +static const char __pyx_k_cnts[] = "cnts"; +static const char __pyx_k_data[] = "data"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_mask[] = "mask"; +static const char __pyx_k_objs[] = "objs"; +static const char __pyx_k_poly[] = "poly"; +static const char __pyx_k_size[] = "size"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_array[] = "array"; +static const char __pyx_k_bbIou[] = "_bbIou"; +static const char __pyx_k_dtype[] = "dtype"; +static const char __pyx_k_iou_2[] = "iou"; +static const char __pyx_k_isbox[] = "isbox"; +static const char __pyx_k_isrle[] = "isrle"; +static const char __pyx_k_masks[] = "masks"; +static const char __pyx_k_merge[] = "merge"; +static const char __pyx_k_numpy[] = "numpy"; +static const char __pyx_k_order[] = "order"; +static const char __pyx_k_pyobj[] = "pyobj"; +static const char __pyx_k_range[] = "range"; +static const char __pyx_k_shape[] = "shape"; +static const char __pyx_k_uint8[] = "uint8"; +static const char __pyx_k_zeros[] = "zeros"; +static const char __pyx_k_astype[] = "astype"; +static const char __pyx_k_author[] = "__author__"; +static const char __pyx_k_counts[] = "counts"; +static const char __pyx_k_decode[] = "decode"; +static const char __pyx_k_double[] = "double"; +static const char __pyx_k_encode[] = "encode"; +static const char __pyx_k_frBbox[] = "frBbox"; +static const char __pyx_k_frPoly[] = "frPoly"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_iouFun[] = "_iouFun"; +static const char __pyx_k_rleIou[] = "_rleIou"; +static const char __pyx_k_toBbox[] = "toBbox"; +static const char __pyx_k_ucRles[] = "ucRles"; +static const char __pyx_k_uint32[] = "uint32"; +static const char __pyx_k_iscrowd[] = "iscrowd"; +static const char __pyx_k_np_poly[] = "np_poly"; +static const char __pyx_k_preproc[] = "_preproc"; +static const char __pyx_k_reshape[] = "reshape"; +static const char __pyx_k_rleObjs[] = "rleObjs"; +static const char __pyx_k_tsungyi[] = "tsungyi"; +static const char __pyx_k_c_string[] = "c_string"; +static const char __pyx_k_frString[] = "_frString"; +static const char __pyx_k_toString[] 
= "_toString"; +static const char __pyx_k_enumerate[] = "enumerate"; +static const char __pyx_k_intersect[] = "intersect"; +static const char __pyx_k_py_string[] = "py_string"; +static const char __pyx_k_pyiscrowd[] = "pyiscrowd"; +static const char __pyx_k_ValueError[] = "ValueError"; +static const char __pyx_k_ImportError[] = "ImportError"; +static const char __pyx_k_frPyObjects[] = "frPyObjects"; +static const char __pyx_k_RuntimeError[] = "RuntimeError"; +static const char __pyx_k_AttributeError[] = "AttributeError"; +static const char __pyx_k_iou_locals__len[] = "iou.._len"; +static const char __pyx_k_frUncompressedRLE[] = "frUncompressedRLE"; +static const char __pyx_k_iou_locals__bbIou[] = "iou.._bbIou"; +static const char __pyx_k_pycocotools__mask[] = "pycocotools._mask"; +static const char __pyx_k_iou_locals__rleIou[] = "iou.._rleIou"; +static const char __pyx_k_iou_locals__preproc[] = "iou.._preproc"; +static const char __pyx_k_input_data_type_not_allowed[] = "input data type not allowed."; +static const char __pyx_k_input_type_is_not_supported[] = "input type is not supported."; +static const char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous"; +static const char __pyx_k_home_ftian_code_py_faster_rcnn[] = "/home/ftian/code/py-faster-rcnn/lib/pycocotools/_mask.pyx"; +static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; +static const char __pyx_k_numpy_ndarray_input_is_only_for[] = "numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension"; +static const char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)"; +static const char __pyx_k_unrecognized_type_The_following[] = "unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported."; +static const char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd"; +static const char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported"; +static const char __pyx_k_The_dt_and_gt_should_have_the_sa[] = "The dt and gt should have the same data type, either RLEs, list or np.ndarray"; +static const char __pyx_k_list_input_can_be_bounding_box_N[] = "list input can be bounding box (Nx4) or RLEs ([RLE])"; +static const char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous"; +static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; +static const char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short."; +static PyObject *__pyx_n_s_AttributeError; +static PyObject *__pyx_n_s_F; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor; +static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2; +static PyObject *__pyx_n_s_ImportError; +static PyObject *__pyx_n_s_N; +static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor; +static PyObject *__pyx_n_s_R; +static PyObject *__pyx_n_s_Rs; +static PyObject *__pyx_n_s_RuntimeError; +static PyObject *__pyx_kp_s_The_dt_and_gt_should_have_the_sa; +static PyObject *__pyx_n_s_ValueError; +static PyObject *__pyx_n_s_a; +static PyObject *__pyx_n_s_a_2; +static PyObject *__pyx_n_s_all; +static PyObject *__pyx_n_s_area; +static PyObject *__pyx_n_s_array; +static PyObject *__pyx_n_s_astype; +static PyObject *__pyx_n_s_author; +static PyObject *__pyx_n_s_bb; +static PyObject *__pyx_n_s_bbIou; +static PyObject *__pyx_n_s_bb_2; +static PyObject 
*__pyx_n_s_c_string; +static PyObject *__pyx_n_s_cnts; +static PyObject *__pyx_n_s_counts; +static PyObject *__pyx_n_s_data; +static PyObject *__pyx_n_s_decode; +static PyObject *__pyx_n_s_double; +static PyObject *__pyx_n_s_dt; +static PyObject *__pyx_n_s_dtype; +static PyObject *__pyx_n_s_encode; +static PyObject *__pyx_n_s_enumerate; +static PyObject *__pyx_n_s_frBbox; +static PyObject *__pyx_n_s_frPoly; +static PyObject *__pyx_n_s_frPyObjects; +static PyObject *__pyx_n_s_frString; +static PyObject *__pyx_n_s_frUncompressedRLE; +static PyObject *__pyx_n_s_gt; +static PyObject *__pyx_n_s_h; +static PyObject *__pyx_kp_s_home_ftian_code_py_faster_rcnn; +static PyObject *__pyx_n_s_i; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_kp_s_input_data_type_not_allowed; +static PyObject *__pyx_kp_s_input_type_is_not_supported; +static PyObject *__pyx_n_s_intersect; +static PyObject *__pyx_n_s_iou; +static PyObject *__pyx_n_s_iouFun; +static PyObject *__pyx_n_s_iou_2; +static PyObject *__pyx_n_s_iou_locals__bbIou; +static PyObject *__pyx_n_s_iou_locals__len; +static PyObject *__pyx_n_s_iou_locals__preproc; +static PyObject *__pyx_n_s_iou_locals__rleIou; +static PyObject *__pyx_n_s_isbox; +static PyObject *__pyx_n_s_iscrowd; +static PyObject *__pyx_n_s_isrle; +static PyObject *__pyx_n_s_j; +static PyObject *__pyx_n_s_len; +static PyObject *__pyx_kp_s_list_input_can_be_bounding_box_N; +static PyObject *__pyx_n_s_m; +static PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_mask; +static PyObject *__pyx_n_s_masks; +static PyObject *__pyx_n_s_merge; +static PyObject *__pyx_n_s_n; +static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous; +static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou; +static PyObject *__pyx_n_s_np; +static PyObject *__pyx_n_s_np_poly; +static PyObject *__pyx_n_s_numpy; +static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to; +static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor; +static PyObject *__pyx_kp_s_numpy_ndarray_input_is_only_for; +static PyObject *__pyx_n_s_obj; +static PyObject *__pyx_n_s_objs; +static PyObject *__pyx_n_s_order; +static PyObject *__pyx_n_s_p; +static PyObject *__pyx_n_s_poly; +static PyObject *__pyx_n_s_preproc; +static PyObject *__pyx_n_s_py_string; +static PyObject *__pyx_n_s_pycocotools__mask; +static PyObject *__pyx_n_s_pyiscrowd; +static PyObject *__pyx_n_s_pyobj; +static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_reshape; +static PyObject *__pyx_n_s_rleIou; +static PyObject *__pyx_n_s_rleObjs; +static PyObject *__pyx_n_s_shape; +static PyObject *__pyx_n_s_size; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_toBbox; +static PyObject *__pyx_n_s_toString; +static PyObject *__pyx_n_s_tsungyi; +static PyObject *__pyx_n_s_ucRles; +static PyObject *__pyx_n_s_uint32; +static PyObject *__pyx_n_s_uint8; +static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd; +static PyObject *__pyx_kp_s_unrecognized_type_The_following; +static PyObject *__pyx_n_s_w; +static PyObject *__pyx_n_s_zeros; +static int __pyx_pf_11pycocotools_5_mask_4RLEs___cinit__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self, siz __pyx_v_n); /* proto */ +static void __pyx_pf_11pycocotools_5_mask_4RLEs_2__dealloc__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_4RLEs_4__getattr__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self, PyObject *__pyx_v_key); /* proto */ +static int __pyx_pf_11pycocotools_5_mask_5Masks___cinit__(struct 
__pyx_obj_11pycocotools_5_mask_Masks *__pyx_v_self, PyObject *__pyx_v_h, PyObject *__pyx_v_w, PyObject *__pyx_v_n); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_5Masks_2__array__(struct __pyx_obj_11pycocotools_5_mask_Masks *__pyx_v_self); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask__toString(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_2_frString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_4encode(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_mask); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_6decode(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_8merge(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs, int __pyx_v_intersect); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_10area(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou__preproc(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_objs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_2_rleIou(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_dt, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_4_bbIou(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dt, PyArrayObject *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_6_len(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_12iou(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_dt, PyObject *__pyx_v_gt, PyObject *__pyx_v_pyiscrowd); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_14toBbox(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_16frBbox(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_bb, siz __pyx_v_h, siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_18frPoly(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_poly, siz __pyx_v_h, siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_20frUncompressedRLE(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_ucRles, CYTHON_UNUSED siz __pyx_v_h, CYTHON_UNUSED siz __pyx_v_w); /* proto */ +static PyObject *__pyx_pf_11pycocotools_5_mask_22frPyObjects(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyobj, siz __pyx_v_h, PyObject *__pyx_v_w); /* proto */ +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */ +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */ +static PyObject *__pyx_tp_new_11pycocotools_5_mask_RLEs(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_tp_new_11pycocotools_5_mask_Masks(PyTypeObject *t, PyObject *a, PyObject *k); /*proto*/ +static PyObject *__pyx_int_0; +static PyObject *__pyx_int_1; +static PyObject *__pyx_int_4; +static PyObject 
*__pyx_tuple_; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_tuple__4; +static PyObject *__pyx_tuple__5; +static PyObject *__pyx_tuple__7; +static PyObject *__pyx_tuple__9; +static PyObject *__pyx_tuple__11; +static PyObject *__pyx_tuple__13; +static PyObject *__pyx_tuple__14; +static PyObject *__pyx_tuple__15; +static PyObject *__pyx_tuple__16; +static PyObject *__pyx_tuple__17; +static PyObject *__pyx_tuple__18; +static PyObject *__pyx_tuple__19; +static PyObject *__pyx_tuple__20; +static PyObject *__pyx_tuple__21; +static PyObject *__pyx_tuple__22; +static PyObject *__pyx_tuple__23; +static PyObject *__pyx_tuple__24; +static PyObject *__pyx_tuple__25; +static PyObject *__pyx_tuple__26; +static PyObject *__pyx_tuple__28; +static PyObject *__pyx_tuple__30; +static PyObject *__pyx_tuple__32; +static PyObject *__pyx_tuple__34; +static PyObject *__pyx_tuple__36; +static PyObject *__pyx_tuple__38; +static PyObject *__pyx_tuple__40; +static PyObject *__pyx_tuple__42; +static PyObject *__pyx_tuple__44; +static PyObject *__pyx_tuple__46; +static PyObject *__pyx_tuple__48; +static PyObject *__pyx_codeobj__6; +static PyObject *__pyx_codeobj__8; +static PyObject *__pyx_codeobj__10; +static PyObject *__pyx_codeobj__12; +static PyObject *__pyx_codeobj__27; +static PyObject *__pyx_codeobj__29; +static PyObject *__pyx_codeobj__31; +static PyObject *__pyx_codeobj__33; +static PyObject *__pyx_codeobj__35; +static PyObject *__pyx_codeobj__37; +static PyObject *__pyx_codeobj__39; +static PyObject *__pyx_codeobj__41; +static PyObject *__pyx_codeobj__43; +static PyObject *__pyx_codeobj__45; +static PyObject *__pyx_codeobj__47; +static PyObject *__pyx_codeobj__49; + +/* "pycocotools/_mask.pyx":57 + * cdef siz _n + * + * def __cinit__(self, siz n =0): # <<<<<<<<<<<<<< + * rlesInit(&self._R, n) + * self._n = n + */ + +/* Python wrapper */ +static int __pyx_pw_11pycocotools_5_mask_4RLEs_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_11pycocotools_5_mask_4RLEs_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + siz __pyx_v_n; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_n,0}; + PyObject* values[1] = {0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (kw_args > 0) { + PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_n); + if (value) { values[0] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 57, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + } + if (values[0]) { + __pyx_v_n = __Pyx_PyInt_As_siz(values[0]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 57, __pyx_L3_error) + } else { + __pyx_v_n = ((siz)0); + } + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__cinit__", 0, 0, 1, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 57, 
__pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.RLEs.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_4RLEs___cinit__(((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_v_self), __pyx_v_n); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_11pycocotools_5_mask_4RLEs___cinit__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self, siz __pyx_v_n) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__", 0); + + /* "pycocotools/_mask.pyx":58 + * + * def __cinit__(self, siz n =0): + * rlesInit(&self._R, n) # <<<<<<<<<<<<<< + * self._n = n + * + */ + rlesInit((&__pyx_v_self->_R), __pyx_v_n); + + /* "pycocotools/_mask.pyx":59 + * def __cinit__(self, siz n =0): + * rlesInit(&self._R, n) + * self._n = n # <<<<<<<<<<<<<< + * + * # free the RLE array here + */ + __pyx_v_self->_n = __pyx_v_n; + + /* "pycocotools/_mask.pyx":57 + * cdef siz _n + * + * def __cinit__(self, siz n =0): # <<<<<<<<<<<<<< + * rlesInit(&self._R, n) + * self._n = n + */ + + /* function exit code */ + __pyx_r = 0; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":62 + * + * # free the RLE array here + * def __dealloc__(self): # <<<<<<<<<<<<<< + * if self._R is not NULL: + * for i in range(self._n): + */ + +/* Python wrapper */ +static void __pyx_pw_11pycocotools_5_mask_4RLEs_3__dealloc__(PyObject *__pyx_v_self); /*proto*/ +static void __pyx_pw_11pycocotools_5_mask_4RLEs_3__dealloc__(PyObject *__pyx_v_self) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__dealloc__ (wrapper)", 0); + __pyx_pf_11pycocotools_5_mask_4RLEs_2__dealloc__(((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_11pycocotools_5_mask_4RLEs_2__dealloc__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self) { + siz __pyx_v_i; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + siz __pyx_t_2; + siz __pyx_t_3; + __Pyx_RefNannySetupContext("__dealloc__", 0); + + /* "pycocotools/_mask.pyx":63 + * # free the RLE array here + * def __dealloc__(self): + * if self._R is not NULL: # <<<<<<<<<<<<<< + * for i in range(self._n): + * free(self._R[i].cnts) + */ + __pyx_t_1 = ((__pyx_v_self->_R != NULL) != 0); + if (__pyx_t_1) { + + /* "pycocotools/_mask.pyx":64 + * def __dealloc__(self): + * if self._R is not NULL: + * for i in range(self._n): # <<<<<<<<<<<<<< + * free(self._R[i].cnts) + * free(self._R) + */ + __pyx_t_2 = __pyx_v_self->_n; + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + + /* "pycocotools/_mask.pyx":65 + * if self._R is not NULL: + * for i in range(self._n): + * free(self._R[i].cnts) # <<<<<<<<<<<<<< + * free(self._R) + * def __getattr__(self, key): + */ + free((__pyx_v_self->_R[__pyx_v_i]).cnts); + } + + /* "pycocotools/_mask.pyx":66 + * for i in range(self._n): + * free(self._R[i].cnts) + * free(self._R) # <<<<<<<<<<<<<< + * def __getattr__(self, key): + * if key == 'n': + */ + free(__pyx_v_self->_R); + + /* "pycocotools/_mask.pyx":63 + * # free the RLE array here + * def __dealloc__(self): + * if self._R is not NULL: # <<<<<<<<<<<<<< + * for i in range(self._n): + * free(self._R[i].cnts) + */ + } + + /* "pycocotools/_mask.pyx":62 + * + * # free the RLE array here + * def __dealloc__(self): # <<<<<<<<<<<<<< + * if self._R is 
not NULL: + * for i in range(self._n): + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "pycocotools/_mask.pyx":67 + * free(self._R[i].cnts) + * free(self._R) + * def __getattr__(self, key): # <<<<<<<<<<<<<< + * if key == 'n': + * return self._n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_4RLEs_5__getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_key); /*proto*/ +static PyObject *__pyx_pw_11pycocotools_5_mask_4RLEs_5__getattr__(PyObject *__pyx_v_self, PyObject *__pyx_v_key) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__getattr__ (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_4RLEs_4__getattr__(((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_v_self), ((PyObject *)__pyx_v_key)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_4RLEs_4__getattr__(struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_self, PyObject *__pyx_v_key) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + __Pyx_RefNannySetupContext("__getattr__", 0); + + /* "pycocotools/_mask.pyx":68 + * free(self._R) + * def __getattr__(self, key): + * if key == 'n': # <<<<<<<<<<<<<< + * return self._n + * raise AttributeError(key) + */ + __pyx_t_1 = (__Pyx_PyString_Equals(__pyx_v_key, __pyx_n_s_n, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) __PYX_ERR(0, 68, __pyx_L1_error) + if (__pyx_t_1) { + + /* "pycocotools/_mask.pyx":69 + * def __getattr__(self, key): + * if key == 'n': + * return self._n # <<<<<<<<<<<<<< + * raise AttributeError(key) + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_self->_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 69, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":68 + * free(self._R) + * def __getattr__(self, key): + * if key == 'n': # <<<<<<<<<<<<<< + * return self._n + * raise AttributeError(key) + */ + } + + /* "pycocotools/_mask.pyx":70 + * if key == 'n': + * return self._n + * raise AttributeError(key) # <<<<<<<<<<<<<< + * + * # python class to wrap Mask array in C + */ + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 70, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_INCREF(__pyx_v_key); + __Pyx_GIVEREF(__pyx_v_key); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_v_key); + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_AttributeError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 70, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(0, 70, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":67 + * free(self._R[i].cnts) + * free(self._R) + * def __getattr__(self, key): # <<<<<<<<<<<<<< + * if key == 'n': + * return self._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("pycocotools._mask.RLEs.__getattr__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":80 + * cdef siz _n + * + * def __cinit__(self, h, w, n): # <<<<<<<<<<<<<< + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + */ + +/* Python wrapper */ +static int 
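+/* Taken together, the three RLEs methods above form a thin owning wrapper
+ * around a C array of RLE structs: __cinit__ calls rlesInit, __dealloc__
+ * frees every _R[i].cnts plus the array itself, and __getattr__ exposes
+ * only the element count. A minimal sketch of the Python-level behaviour:
+ *
+ *     rs = RLEs(3)
+ *     rs.n          # -> 3
+ *     rs.other      # -> AttributeError
+ *
+ * The Masks wrapper defined next manages its buffer differently; see the
+ * note in __array__ below.
+ */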
__pyx_pw_11pycocotools_5_mask_5Masks_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static int __pyx_pw_11pycocotools_5_mask_5Masks_1__cinit__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_h = 0; + PyObject *__pyx_v_w = 0; + PyObject *__pyx_v_n = 0; + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__cinit__ (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_h,&__pyx_n_s_w,&__pyx_n_s_n,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 1); __PYX_ERR(0, 80, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, 2); __PYX_ERR(0, 80, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__cinit__") < 0)) __PYX_ERR(0, 80, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_h = values[0]; + __pyx_v_w = values[1]; + __pyx_v_n = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("__cinit__", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 80, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.Masks.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return -1; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_5Masks___cinit__(((struct __pyx_obj_11pycocotools_5_mask_Masks *)__pyx_v_self), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_11pycocotools_5_mask_5Masks___cinit__(struct __pyx_obj_11pycocotools_5_mask_Masks *__pyx_v_self, PyObject *__pyx_v_h, PyObject *__pyx_v_w, PyObject *__pyx_v_n) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + size_t __pyx_t_4; + siz __pyx_t_5; + __Pyx_RefNannySetupContext("__cinit__", 0); + + /* "pycocotools/_mask.pyx":81 + * + * def __cinit__(self, h, w, n): + * self._mask = malloc(h*w*n* sizeof(byte)) # <<<<<<<<<<<<<< + * self._h = h + * self._w = w + */ + __pyx_t_1 = PyNumber_Multiply(__pyx_v_h, __pyx_v_w); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = PyNumber_Multiply(__pyx_t_1, __pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = 
__Pyx_PyInt_FromSize_t((sizeof(byte))); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyNumber_Multiply(__pyx_t_2, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_4 = __Pyx_PyInt_As_size_t(__pyx_t_3); if (unlikely((__pyx_t_4 == (size_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 81, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_self->_mask = ((byte *)malloc(__pyx_t_4)); + + /* "pycocotools/_mask.pyx":82 + * def __cinit__(self, h, w, n): + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h # <<<<<<<<<<<<<< + * self._w = w + * self._n = n + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_h); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 82, __pyx_L1_error) + __pyx_v_self->_h = __pyx_t_5; + + /* "pycocotools/_mask.pyx":83 + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + * self._w = w # <<<<<<<<<<<<<< + * self._n = n + * # def __dealloc__(self): + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_w); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 83, __pyx_L1_error) + __pyx_v_self->_w = __pyx_t_5; + + /* "pycocotools/_mask.pyx":84 + * self._h = h + * self._w = w + * self._n = n # <<<<<<<<<<<<<< + * # def __dealloc__(self): + * # the memory management of _mask has been passed to np.ndarray + */ + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_v_n); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 84, __pyx_L1_error) + __pyx_v_self->_n = __pyx_t_5; + + /* "pycocotools/_mask.pyx":80 + * cdef siz _n + * + * def __cinit__(self, h, w, n): # <<<<<<<<<<<<<< + * self._mask = malloc(h*w*n* sizeof(byte)) + * self._h = h + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_AddTraceback("pycocotools._mask.Masks.__cinit__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":90 + * + * # called when passing into np.array() and return an np.ndarray in column-major order + * def __array__(self): # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = self._h*self._w*self._n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_5Masks_3__array__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused); /*proto*/ +static PyObject *__pyx_pw_11pycocotools_5_mask_5Masks_3__array__(PyObject *__pyx_v_self, CYTHON_UNUSED PyObject *unused) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__array__ (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_5Masks_2__array__(((struct __pyx_obj_11pycocotools_5_mask_Masks *)__pyx_v_self)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_5Masks_2__array__(struct __pyx_obj_11pycocotools_5_mask_Masks *__pyx_v_self) { + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_ndarray = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("__array__", 0); + + /* "pycocotools/_mask.pyx":92 + * def __array__(self): + * cdef np.npy_intp 
shape[1] + * shape[0] = self._h*self._w*self._n # <<<<<<<<<<<<<< + * # Create a 1D array, and reshape it to fortran/Matlab column-major array + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + */ + (__pyx_v_shape[0]) = ((((npy_intp)__pyx_v_self->_h) * __pyx_v_self->_w) * __pyx_v_self->_n); + + /* "pycocotools/_mask.pyx":94 + * shape[0] = self._h*self._w*self._n + * # Create a 1D array, and reshape it to fortran/Matlab column-major array + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') # <<<<<<<<<<<<<< + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + */ + __pyx_t_1 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_UINT8, __pyx_v_self->_mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_reshape); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_self->_h); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_From_siz(__pyx_v_self->_w); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_self->_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyTuple_New(3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_5, 2, __pyx_t_4); + __pyx_t_1 = 0; + __pyx_t_3 = 0; + __pyx_t_4 = 0; + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyDict_New(); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (PyDict_SetItem(__pyx_t_5, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 94, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 94, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_ndarray = __pyx_t_3; + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":96 + * ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return ndarray + * + */ + if (!(likely(((__pyx_v_ndarray) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_ndarray, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 96, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_ndarray), NPY_OWNDATA); + + /* "pycocotools/_mask.pyx":97 + * # The _mask allocated by Masks is now handled by ndarray + * PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + * return ndarray # <<<<<<<<<<<<<< + * + * # internal conversion from Python RLEs object to compressed RLE format + */ + __Pyx_XDECREF(__pyx_r); + 
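+/* The PyArray_SimpleNewFromData call above wraps the malloc'ed _mask
+ * buffer without copying, and PyArray_ENABLEFLAGS(..., NPY_OWNDATA) then
+ * hands ownership of that buffer to the returned ndarray, which is why
+ * Masks has no __dealloc__ (see the comment quoted from the .pyx source).
+ * Roughly, at the Python level:
+ *
+ *     m = Masks(h, w, n)
+ *     a = np.array(m)   # invokes __array__; the ndarray now owns the data
+ *
+ * so the buffer is freed exactly once, when the ndarray is collected.
+ */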
__Pyx_INCREF(__pyx_v_ndarray); + __pyx_r = __pyx_v_ndarray; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":90 + * + * # called when passing into np.array() and return an np.ndarray in column-major order + * def __array__(self): # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = self._h*self._w*self._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("pycocotools._mask.Masks.__array__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_ndarray); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":100 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_1_toString(PyObject *__pyx_self, PyObject *__pyx_v_Rs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_1_toString = {"_toString", (PyCFunction)__pyx_pw_11pycocotools_5_mask_1_toString, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_1_toString(PyObject *__pyx_self, PyObject *__pyx_v_Rs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_toString (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_Rs), __pyx_ptype_11pycocotools_5_mask_RLEs, 1, "Rs", 0))) __PYX_ERR(0, 100, __pyx_L1_error) + __pyx_r = __pyx_pf_11pycocotools_5_mask__toString(__pyx_self, ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_v_Rs)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask__toString(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs) { + siz __pyx_v_n; + PyObject *__pyx_v_py_string = 0; + char *__pyx_v_c_string; + PyObject *__pyx_v_objs = NULL; + siz __pyx_v_i; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + siz __pyx_t_2; + siz __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + int __pyx_t_7; + __Pyx_RefNannySetupContext("_toString", 0); + + /* "pycocotools/_mask.pyx":101 + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): + * cdef siz n = Rs.n # <<<<<<<<<<<<<< + * cdef bytes py_string + * cdef char* c_string + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_Rs), __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 101, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_2 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 101, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_n = __pyx_t_2; + + /* "pycocotools/_mask.pyx":104 + * cdef bytes py_string + * cdef char* c_string + * objs = [] # <<<<<<<<<<<<<< + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) + */ + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 104, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_objs = ((PyObject*)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":105 + * cdef char* c_string + * objs = [] + * for i in range(n): # <<<<<<<<<<<<<< + * c_string = rleToString( 
&Rs._R[i] ) + * py_string = c_string + */ + __pyx_t_2 = __pyx_v_n; + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + + /* "pycocotools/_mask.pyx":106 + * objs = [] + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) # <<<<<<<<<<<<<< + * py_string = c_string + * objs.append({ + */ + __pyx_v_c_string = rleToString(((RLE *)(&(__pyx_v_Rs->_R[__pyx_v_i])))); + + /* "pycocotools/_mask.pyx":107 + * for i in range(n): + * c_string = rleToString( &Rs._R[i] ) + * py_string = c_string # <<<<<<<<<<<<<< + * objs.append({ + * 'size': [Rs._R[i].h, Rs._R[i].w], + */ + __pyx_t_1 = __Pyx_PyBytes_FromString(__pyx_v_c_string); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 107, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_XDECREF_SET(__pyx_v_py_string, ((PyObject*)__pyx_t_1)); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":109 + * py_string = c_string + * objs.append({ + * 'size': [Rs._R[i].h, Rs._R[i].w], # <<<<<<<<<<<<<< + * 'counts': py_string + * }) + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 109, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = __Pyx_PyInt_From_siz((__pyx_v_Rs->_R[__pyx_v_i]).h); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 109, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyInt_From_siz((__pyx_v_Rs->_R[__pyx_v_i]).w); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 109, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = PyList_New(2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 109, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_4); + PyList_SET_ITEM(__pyx_t_6, 0, __pyx_t_4); + __Pyx_GIVEREF(__pyx_t_5); + PyList_SET_ITEM(__pyx_t_6, 1, __pyx_t_5); + __pyx_t_4 = 0; + __pyx_t_5 = 0; + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_size, __pyx_t_6) < 0) __PYX_ERR(0, 109, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + + /* "pycocotools/_mask.pyx":111 + * 'size': [Rs._R[i].h, Rs._R[i].w], + * 'counts': py_string + * }) # <<<<<<<<<<<<<< + * free(c_string) + * return objs + */ + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_counts, __pyx_v_py_string) < 0) __PYX_ERR(0, 109, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":108 + * c_string = rleToString( &Rs._R[i] ) + * py_string = c_string + * objs.append({ # <<<<<<<<<<<<<< + * 'size': [Rs._R[i].h, Rs._R[i].w], + * 'counts': py_string + */ + __pyx_t_7 = __Pyx_PyList_Append(__pyx_v_objs, __pyx_t_1); if (unlikely(__pyx_t_7 == -1)) __PYX_ERR(0, 108, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":112 + * 'counts': py_string + * }) + * free(c_string) # <<<<<<<<<<<<<< + * return objs + * + */ + free(__pyx_v_c_string); + } + + /* "pycocotools/_mask.pyx":113 + * }) + * free(c_string) + * return objs # <<<<<<<<<<<<<< + * + * # internal conversion from compressed RLE format to Python RLEs object + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":100 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("pycocotools._mask._toString", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_py_string); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + 
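+/* _toString converts each RLE held by the wrapper into the compressed
+ * counts string used by the COCO mask format, copying rleToString's
+ * malloc'ed buffer into a Python bytes object before free()ing it inside
+ * the loop. The result is a list shaped like (sketch only):
+ *
+ *     [{'size': [h, w], 'counts': b'...'}, ...]
+ *
+ * which is exactly what _frString below parses back into an RLEs object.
+ */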
__Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":116 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_3_frString(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_3_frString = {"_frString", (PyCFunction)__pyx_pw_11pycocotools_5_mask_3_frString, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_3_frString(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_frString (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_2_frString(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_2_frString(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + siz __pyx_v_n; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_py_string = 0; + char *__pyx_v_c_string; + PyObject *__pyx_v_i = NULL; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *(*__pyx_t_4)(PyObject *); + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + char *__pyx_t_7; + Py_ssize_t __pyx_t_8; + siz __pyx_t_9; + siz __pyx_t_10; + __Pyx_RefNannySetupContext("_frString", 0); + + /* "pycocotools/_mask.pyx":117 + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): + * cdef siz n = len(rleObjs) # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * cdef bytes py_string + */ + __pyx_t_1 = PyObject_Length(__pyx_v_rleObjs); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 117, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "pycocotools/_mask.pyx":118 + * def _frString(rleObjs): + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * cdef bytes py_string + * cdef char* c_string + */ + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 118, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 118, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); + __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), __pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 118, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_2); + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":121 + * cdef bytes py_string + * cdef char* c_string + * for i, obj in enumerate(rleObjs): # <<<<<<<<<<<<<< + * py_string = str(obj['counts']) + * c_string = py_string + */ + __Pyx_INCREF(__pyx_int_0); + __pyx_t_2 = __pyx_int_0; + if (likely(PyList_CheckExact(__pyx_v_rleObjs)) || PyTuple_CheckExact(__pyx_v_rleObjs)) { + __pyx_t_3 = __pyx_v_rleObjs; __Pyx_INCREF(__pyx_t_3); __pyx_t_1 = 0; + __pyx_t_4 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_rleObjs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 
121, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_4)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 121, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 121, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } + } else { + __pyx_t_5 = __pyx_t_4(__pyx_t_3); + if (unlikely(!__pyx_t_5)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 121, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_5); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_5); + __pyx_t_5 = 0; + __Pyx_INCREF(__pyx_t_2); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_2); + __pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_t_2, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 121, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); + __pyx_t_2 = __pyx_t_5; + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":122 + * cdef char* c_string + * for i, obj in enumerate(rleObjs): + * py_string = str(obj['counts']) # <<<<<<<<<<<<<< + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + */ + __pyx_t_5 = PyObject_GetItem(__pyx_v_obj, __pyx_n_s_counts); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 122, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 122, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&PyString_Type)), __pyx_t_6, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 122, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (!(likely(PyBytes_CheckExact(__pyx_t_5))||((__pyx_t_5) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "bytes", Py_TYPE(__pyx_t_5)->tp_name), 0))) __PYX_ERR(0, 122, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_py_string, ((PyObject*)__pyx_t_5)); + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":123 + * for i, obj in enumerate(rleObjs): + * py_string = str(obj['counts']) + * c_string = py_string # <<<<<<<<<<<<<< + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + * return Rs + */ + __pyx_t_7 = __Pyx_PyObject_AsString(__pyx_v_py_string); if (unlikely((!__pyx_t_7) && PyErr_Occurred())) __PYX_ERR(0, 123, __pyx_L1_error) + __pyx_v_c_string = __pyx_t_7; + + /* "pycocotools/_mask.pyx":124 + * py_string = str(obj['counts']) + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) # <<<<<<<<<<<<<< + * return Rs + * + */ + __pyx_t_8 = __Pyx_PyIndex_AsSsize_t(__pyx_v_i); if (unlikely((__pyx_t_8 == (Py_ssize_t)-1) && PyErr_Occurred())) 
__PYX_ERR(0, 124, __pyx_L1_error) + __pyx_t_5 = PyObject_GetItem(__pyx_v_obj, __pyx_n_s_size); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_GetItemInt(__pyx_t_5, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_9 = __Pyx_PyInt_As_siz(__pyx_t_6); if (unlikely((__pyx_t_9 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = PyObject_GetItem(__pyx_v_obj, __pyx_n_s_size); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_5 = __Pyx_GetItemInt(__pyx_t_6, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_10 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_10 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 124, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + rleFrString(((RLE *)(&(__pyx_v_Rs->_R[__pyx_t_8]))), ((char *)__pyx_v_c_string), __pyx_t_9, __pyx_t_10); + + /* "pycocotools/_mask.pyx":121 + * cdef bytes py_string + * cdef char* c_string + * for i, obj in enumerate(rleObjs): # <<<<<<<<<<<<<< + * py_string = str(obj['counts']) + * c_string = py_string + */ + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":125 + * c_string = py_string + * rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + * return Rs # <<<<<<<<<<<<<< + * + * # encode mask to RLEs objects + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __pyx_r = ((PyObject *)__pyx_v_Rs); + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":116 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("pycocotools._mask._frString", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_py_string); + __Pyx_XDECREF(__pyx_v_i); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":129 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_5encode(PyObject *__pyx_self, PyObject *__pyx_v_mask); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_5encode = {"encode", (PyCFunction)__pyx_pw_11pycocotools_5_mask_5encode, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_5encode(PyObject *__pyx_self, PyObject *__pyx_v_mask) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("encode (wrapper)", 0); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_mask), __pyx_ptype_5numpy_ndarray, 1, "mask", 0))) __PYX_ERR(0, 129, __pyx_L1_error) + __pyx_r = 
__pyx_pf_11pycocotools_5_mask_4encode(__pyx_self, ((PyArrayObject *)__pyx_v_mask)); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_4encode(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_mask) { + npy_intp __pyx_v_h; + npy_intp __pyx_v_w; + npy_intp __pyx_v_n; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = 0; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_mask; + __Pyx_Buffer __pyx_pybuffer_mask; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + npy_intp __pyx_t_1; + npy_intp __pyx_t_2; + npy_intp __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + __Pyx_RefNannySetupContext("encode", 0); + __pyx_pybuffer_mask.pybuffer.buf = NULL; + __pyx_pybuffer_mask.refcount = 0; + __pyx_pybuffernd_mask.data = NULL; + __pyx_pybuffernd_mask.rcbuffer = &__pyx_pybuffer_mask; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_mask.rcbuffer->pybuffer, (PyObject*)__pyx_v_mask, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_F_CONTIGUOUS, 3, 0, __pyx_stack) == -1)) __PYX_ERR(0, 129, __pyx_L1_error) + } + __pyx_pybuffernd_mask.diminfo[0].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_mask.diminfo[0].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_mask.diminfo[1].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_mask.diminfo[1].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[1]; __pyx_pybuffernd_mask.diminfo[2].strides = __pyx_pybuffernd_mask.rcbuffer->pybuffer.strides[2]; __pyx_pybuffernd_mask.diminfo[2].shape = __pyx_pybuffernd_mask.rcbuffer->pybuffer.shape[2]; + + /* "pycocotools/_mask.pyx":130 + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] # <<<<<<<<<<<<<< + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) + */ + __pyx_t_1 = (__pyx_v_mask->dimensions[0]); + __pyx_t_2 = (__pyx_v_mask->dimensions[1]); + __pyx_t_3 = (__pyx_v_mask->dimensions[2]); + __pyx_v_h = __pyx_t_1; + __pyx_v_w = __pyx_t_2; + __pyx_v_n = __pyx_t_3; + + /* "pycocotools/_mask.pyx":131 + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) # <<<<<<<<<<<<<< + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) + */ + __pyx_t_4 = __Pyx_PyInt_From_Py_intptr_t(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), __pyx_t_5, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 131, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_4); + __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":132 + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) # 
<<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + rleEncode(__pyx_v_Rs->_R, ((byte *)__pyx_v_mask->data), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* "pycocotools/_mask.pyx":133 + * cdef RLEs Rs = RLEs(n) + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (!__pyx_t_6) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_5, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_6); __pyx_t_6 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_7, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 133, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_objs = __pyx_t_4; + __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":134 + * rleEncode(Rs._R,mask.data,h,w,n) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * # decode mask from compressed list of RLE string or RLEs object + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":129 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_mask.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.encode", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; 
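+  /* Illustrative usage sketch (assumes the public pycocotools._mask module
+   * name; not part of the generated code). encode() takes a Fortran-ordered
+   * uint8 array of shape (h, w, n) and returns one RLE dict per mask:
+   *   import numpy as np
+   *   from pycocotools import _mask
+   *   m = np.asfortranarray(np.ones((240, 320, 1), dtype=np.uint8))
+   *   rles = _mask.encode(m)  # [{'size': [240, 320], 'counts': ...}]
+   */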
+ __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_mask.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":137 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_7decode(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_7decode = {"decode", (PyCFunction)__pyx_pw_11pycocotools_5_mask_7decode, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_7decode(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("decode (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_6decode(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_6decode(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = 0; + siz __pyx_v_h; + siz __pyx_v_w; + siz __pyx_v_n; + struct __pyx_obj_11pycocotools_5_mask_Masks *__pyx_v_masks = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + siz __pyx_t_5; + siz __pyx_t_6; + siz __pyx_t_7; + __Pyx_RefNannySetupContext("decode", 0); + + /* "pycocotools/_mask.pyx":138 + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + 
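+  /* Illustrative note (not generated code): decode() inverts encode(). It
+   * runs rleDecode() into a column-major Masks buffer and hands it back
+   * through np.array(masks), i.e. an (h, w, n) uint8 ndarray:
+   *   m2 = _mask.decode(rles)  # shape (240, 320, 1) for the sketch above
+   */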
__Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 138, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_11pycocotools_5_mask_RLEs))))) __PYX_ERR(0, 138, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":139 + * def decode(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n # <<<<<<<<<<<<<< + * masks = Masks(h, w, n) + * rleDecode( Rs._R, masks._mask, n ); + */ + __pyx_t_5 = (__pyx_v_Rs->_R[0]).h; + __pyx_t_6 = (__pyx_v_Rs->_R[0]).w; + __pyx_t_7 = __pyx_v_Rs->_n; + __pyx_v_h = __pyx_t_5; + __pyx_v_w = __pyx_t_6; + __pyx_v_n = __pyx_t_7; + + /* "pycocotools/_mask.pyx":140 + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) # <<<<<<<<<<<<<< + * rleDecode( Rs._R, masks._mask, n ); + * return np.array(masks) + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyInt_From_siz(__pyx_v_w); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyTuple_New(3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_3, 2, __pyx_t_4); + __pyx_t_1 = 0; + __pyx_t_2 = 0; + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_Masks), __pyx_t_3, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 140, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_masks = ((struct __pyx_obj_11pycocotools_5_mask_Masks *)__pyx_t_4); + __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":141 + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + * masks = Masks(h, w, n) + * rleDecode( Rs._R, masks._mask, n ); # <<<<<<<<<<<<<< + * return np.array(masks) + * + */ + rleDecode(((RLE *)__pyx_v_Rs->_R), __pyx_v_masks->_mask, __pyx_v_n); + + /* "pycocotools/_mask.pyx":142 + * masks = Masks(h, w, n) + * rleDecode( Rs._R, masks._mask, n ); + * return np.array(masks) # <<<<<<<<<<<<<< + * + * def merge(rleObjs, bint intersect=0): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_4 = 
__Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_masks)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_masks)}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_masks)}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else + #endif + { + __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_masks)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_masks)); + PyTuple_SET_ITEM(__pyx_t_1, 0+1, ((PyObject *)__pyx_v_masks)); + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 142, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_r = __pyx_t_4; + __pyx_t_4 = 0; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":137 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("pycocotools._mask.decode", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF((PyObject *)__pyx_v_masks); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":144 + * return np.array(masks) + * + * def merge(rleObjs, bint intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_9merge(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_9merge = {"merge", (PyCFunction)__pyx_pw_11pycocotools_5_mask_9merge, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_9merge(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_rleObjs = 0; + int __pyx_v_intersect; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("merge (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_rleObjs,&__pyx_n_s_intersect,0}; + PyObject* values[2] = {0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = 
PyDict_GetItem(__pyx_kwds, __pyx_n_s_rleObjs)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (kw_args > 0) { + PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_intersect); + if (value) { values[1] = value; kw_args--; } + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "merge") < 0)) __PYX_ERR(0, 144, __pyx_L3_error) + } + } else { + switch (PyTuple_GET_SIZE(__pyx_args)) { + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + break; + default: goto __pyx_L5_argtuple_error; + } + } + __pyx_v_rleObjs = values[0]; + if (values[1]) { + __pyx_v_intersect = __Pyx_PyObject_IsTrue(values[1]); if (unlikely((__pyx_v_intersect == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 144, __pyx_L3_error) + } else { + __pyx_v_intersect = ((int)0); + } + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("merge", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 144, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.merge", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_8merge(__pyx_self, __pyx_v_rleObjs, __pyx_v_intersect); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_8merge(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs, int __pyx_v_intersect) { + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = 0; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_R = 0; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + __Pyx_RefNannySetupContext("merge", 0); + + /* "pycocotools/_mask.pyx":145 + * + * def merge(rleObjs, bint intersect=0): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + 
{ + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_11pycocotools_5_mask_RLEs))))) __PYX_ERR(0, 145, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":146 + * def merge(rleObjs, bint intersect=0): + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) # <<<<<<<<<<<<<< + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + */ + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), __pyx_tuple_, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v_R = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":147 + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) # <<<<<<<<<<<<<< + * obj = _toString(R)[0] + * return obj + */ + rleMerge(((RLE *)__pyx_v_Rs->_R), ((RLE *)__pyx_v_R->_R), ((siz)__pyx_v_Rs->_n), __pyx_v_intersect); + + /* "pycocotools/_mask.pyx":148 + * cdef RLEs R = RLEs(1) + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] # <<<<<<<<<<<<<< + * return obj + * + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_4) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_R)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, ((PyObject *)__pyx_v_R)}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, ((PyObject *)__pyx_v_R)}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_R)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_R)); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, ((PyObject *)__pyx_v_R)); + 
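+  /* Illustrative note (not generated code): merge() collapses Rs._n RLEs
+   * into a single RLE via rleMerge(); intersect=0 merges the masks by union
+   * and intersect=1 by intersection, returning one compressed RLE dict:
+   *   merged = _mask.merge(rles, intersect=0)
+   */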
__pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 148, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_obj = __pyx_t_2; + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":149 + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + * return obj # <<<<<<<<<<<<<< + * + * def area(rleObjs): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_obj); + __pyx_r = __pyx_v_obj; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":144 + * return np.array(masks) + * + * def merge(rleObjs, bint intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("pycocotools._mask.merge", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF((PyObject *)__pyx_v_R); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":151 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_11area(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_11area = {"area", (PyCFunction)__pyx_pw_11pycocotools_5_mask_11area, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_11area(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("area (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_10area(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_10area(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = 0; + uint *__pyx_v__a; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_a = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("area", 0); + + /* "pycocotools/_mask.pyx":152 + * + * def area(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + * rleArea(Rs._R, Rs._n, _a) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_rleObjs); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_11pycocotools_5_mask_RLEs))))) __PYX_ERR(0, 152, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":153 + * def area(rleObjs): + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) # <<<<<<<<<<<<<< + * rleArea(Rs._R, Rs._n, _a) + * cdef np.npy_intp shape[1] + */ + __pyx_v__a = ((uint *)malloc((__pyx_v_Rs->_n * (sizeof(unsigned int))))); + + /* "pycocotools/_mask.pyx":154 + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + * rleArea(Rs._R, Rs._n, _a) # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n + */ + rleArea(__pyx_v_Rs->_R, __pyx_v_Rs->_n, __pyx_v__a); + + /* "pycocotools/_mask.pyx":156 + * rleArea(Rs._R, Rs._n, _a) + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n # <<<<<<<<<<<<<< + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + */ + (__pyx_v_shape[0]) = ((npy_intp)__pyx_v_Rs->_n); + + /* "pycocotools/_mask.pyx":157 + * cdef np.npy_intp shape[1] + * shape[0] = Rs._n + * a = np.array((Rs._n, ), dtype=np.uint8) # <<<<<<<<<<<<<< + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_Rs->_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + 
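+  /* Illustrative note (not generated code): area() fills a malloc'd uint
+   * buffer via rleArea() and wraps it as a 1-D NPY_UINT32 array; the
+   * NPY_OWNDATA flag set below hands the buffer over to numpy, so
+   *   a = _mask.area(rles)  # uint32 pixel counts, len(a) == number of RLEs
+   */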
__Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = PyDict_New(); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_uint8); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 157, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_a = __pyx_t_5; + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":158 + * shape[0] = Rs._n + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + * return a + */ + __pyx_t_5 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_UINT32, __pyx_v__a); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 158, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_a, __pyx_t_5); + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":159 + * a = np.array((Rs._n, ), dtype=np.uint8) + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return a + * + */ + if (!(likely(((__pyx_v_a) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_a, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 159, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_a), NPY_OWNDATA); + + /* "pycocotools/_mask.pyx":160 + * a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + * PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + * return a # <<<<<<<<<<<<<< + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_a); + __pyx_r = __pyx_v_a; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":151 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("pycocotools._mask.area", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_a); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":163 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_13iou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_13iou = {"iou", (PyCFunction)__pyx_pw_11pycocotools_5_mask_13iou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_13iou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_dt = 0; + PyObject *__pyx_v_gt = 0; + PyObject *__pyx_v_pyiscrowd = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("iou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_pyiscrowd,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, 1); __PYX_ERR(0, 163, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_pyiscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, 2); __PYX_ERR(0, 163, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "iou") < 0)) __PYX_ERR(0, 163, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_dt = values[0]; + __pyx_v_gt = values[1]; + __pyx_v_pyiscrowd = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("iou", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 163, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.iou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_12iou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_pyiscrowd); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":164 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
+ * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_1_preproc(PyObject *__pyx_self, PyObject *__pyx_v_objs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_3iou_1_preproc = {"_preproc", (PyCFunction)__pyx_pw_11pycocotools_5_mask_3iou_1_preproc, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_1_preproc(PyObject *__pyx_self, PyObject *__pyx_v_objs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_preproc (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_3iou__preproc(__pyx_self, ((PyObject *)__pyx_v_objs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou__preproc(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_objs) { + PyObject *__pyx_v_isbox = NULL; + PyObject *__pyx_v_isrle = NULL; + PyObject *__pyx_v_obj = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + int __pyx_t_8; + int __pyx_t_9; + PyObject *__pyx_t_10 = NULL; + PyObject *(*__pyx_t_11)(PyObject *); + PyObject *__pyx_t_12 = NULL; + Py_ssize_t __pyx_t_13; + PyObject *__pyx_t_14 = NULL; + __Pyx_RefNannySetupContext("_preproc", 0); + __Pyx_INCREF(__pyx_v_objs); + + /* "pycocotools/_mask.pyx":165 + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): + * if len(objs) == 0: # <<<<<<<<<<<<<< + * return objs + * if type(objs) == np.ndarray: + */ + __pyx_t_1 = PyObject_Length(__pyx_v_objs); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 165, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_1 == 0) != 0); + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":166 + * def _preproc(objs): + * if len(objs) == 0: + * return objs # <<<<<<<<<<<<<< + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":165 + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): + * if len(objs) == 0: # <<<<<<<<<<<<<< + * return objs + * if type(objs) == np.ndarray: + */ + } + + /* "pycocotools/_mask.pyx":167 + * if len(objs) == 0: + * return objs + * if type(objs) == np.ndarray: # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_objs)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 167, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 167, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":168 + * return objs + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + */ + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 168, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 168, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = ((__pyx_t_1 == 1) != 0); + if (__pyx_t_2) { + + /* 
"pycocotools/_mask.pyx":169 + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) # <<<<<<<<<<<<<< + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_reshape); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_GetItemInt(__pyx_v_objs, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_int_1); + __pyx_t_5 = 0; + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (!__pyx_t_5) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __pyx_t_5 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 169, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_3); + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":168 + * return objs + * if type(objs) == np.ndarray: + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + */ + } + + /* "pycocotools/_mask.pyx":171 + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: # <<<<<<<<<<<<<< + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + */ + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_1 = PyObject_Length(__pyx_t_3); if (unlikely(__pyx_t_1 
== -1)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_8 = ((!((__pyx_t_1 == 2) != 0)) != 0); + if (!__pyx_t_8) { + } else { + __pyx_t_2 = __pyx_t_8; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_GetItemInt(__pyx_t_3, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyInt_EqObjC(__pyx_t_4, __pyx_int_4, 4, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 171, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_9 = ((!__pyx_t_8) != 0); + __pyx_t_2 = __pyx_t_9; + __pyx_L7_bool_binop_done:; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":172 + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') # <<<<<<<<<<<<<< + * objs = objs.astype(np.double) + * elif type(objs) == list: + */ + __pyx_t_3 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 172, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(0, 172, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":171 + * objs = objs.reshape((objs[0], 1)) + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: # <<<<<<<<<<<<<< + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + */ + } + + /* "pycocotools/_mask.pyx":173 + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) # <<<<<<<<<<<<<< + * elif type(objs) == list: + * # check if list is in box format and convert it to np.ndarray + */ + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_astype); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_double); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_4); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_4, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_4, 
__pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_6}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_5, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 173, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + } + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_3); + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":167 + * if len(objs) == 0: + * return objs + * if type(objs) == np.ndarray: # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((objs[0], 1)) + */ + goto __pyx_L4; + } + + /* "pycocotools/_mask.pyx":174 + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + * elif type(objs) == list: # <<<<<<<<<<<<<< + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + */ + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_objs)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 174, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 174, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":176 + * elif type(objs) == list: + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) # <<<<<<<<<<<<<< + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: + */ + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_all); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_array); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = PyList_New(0); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + if (likely(PyList_CheckExact(__pyx_v_objs)) || PyTuple_CheckExact(__pyx_v_objs)) { + __pyx_t_10 = __pyx_v_objs; __Pyx_INCREF(__pyx_t_10); __pyx_t_1 = 0; + __pyx_t_11 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_10 = 
PyObject_GetIter(__pyx_v_objs); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_11 = Py_TYPE(__pyx_t_10)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 176, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_11)) { + if (likely(PyList_CheckExact(__pyx_t_10))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_10)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_12 = PyList_GET_ITEM(__pyx_t_10, __pyx_t_1); __Pyx_INCREF(__pyx_t_12); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 176, __pyx_L1_error) + #else + __pyx_t_12 = PySequence_ITEM(__pyx_t_10, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_10)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_12 = PyTuple_GET_ITEM(__pyx_t_10, __pyx_t_1); __Pyx_INCREF(__pyx_t_12); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 176, __pyx_L1_error) + #else + __pyx_t_12 = PySequence_ITEM(__pyx_t_10, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + #endif + } + } else { + __pyx_t_12 = __pyx_t_11(__pyx_t_10); + if (unlikely(!__pyx_t_12)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 176, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_12); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_12); + __pyx_t_12 = 0; + __pyx_t_13 = PyObject_Length(__pyx_v_obj); if (unlikely(__pyx_t_13 == -1)) __PYX_ERR(0, 176, __pyx_L1_error) + __pyx_t_2 = (__pyx_t_13 == 4); + if (__pyx_t_2) { + } else { + __pyx_t_14 = __Pyx_PyBool_FromLong(__pyx_t_2); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __pyx_t_14 = 0; + goto __pyx_L11_bool_binop_done; + } + __pyx_t_14 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_14); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 176, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_14); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 176, __pyx_L1_error) + if (!__pyx_t_2) { + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + } else { + __Pyx_INCREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + goto __pyx_L11_bool_binop_done; + } + __pyx_t_14 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_14); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_INCREF(__pyx_t_14); + __pyx_t_12 = __pyx_t_14; + __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; + __pyx_L11_bool_binop_done:; + if (unlikely(__Pyx_ListComp_Append(__pyx_t_6, (PyObject*)__pyx_t_12))) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __pyx_t_10 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_10)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_10); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (!__pyx_t_10) { + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_6); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 176, 
__pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_4); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_6}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_10, __pyx_t_6}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + } else + #endif + { + __pyx_t_12 = PyTuple_New(1+1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_10); PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_10); __pyx_t_10 = NULL; + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_12, 0+1, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_12, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + } + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_4}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_4}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_12 = PyTuple_New(1+1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_12, 0+1, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_12, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 176, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + } + } + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_isbox = __pyx_t_3; + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":177 + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + * 
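For review purposes, the Python-level logic that all of this generated C expands is small. Below is a transcription of `_preproc`, assembled from the `_mask.pyx` fragments quoted in the generated comments above and below; `_frString` (the RLE-dict decoder) is defined earlier in the same pyx and is assumed here:

```python
import numpy as np

def _preproc(objs):
    # Normalize iou() input to either an Nx4 double box array or RLEs.
    if len(objs) == 0:
        return objs
    if type(objs) == np.ndarray:
        if len(objs.shape) == 1:
            objs = objs.reshape((objs[0], 1))   # sic: uses objs[0], not objs.shape[0]
        if not len(objs.shape) == 2 or not objs.shape[1] == 4:
            raise Exception('numpy ndarray input is only for *bounding boxes* '
                            'and should have Nx4 dimension')
        objs = objs.astype(np.double)
    elif type(objs) == list:
        # A list is either all 4-element boxes or all RLE dicts.
        isbox = np.all(np.array([(len(obj) == 4) and
                                 ((type(obj) == list) or (type(obj) == np.ndarray))
                                 for obj in objs]))
        isrle = np.all(np.array([type(obj) == dict for obj in objs]))
        if isbox:
            objs = np.array(objs, dtype=np.double)
            if len(objs.shape) == 1:
                objs = objs.reshape((1, objs.shape[0]))
        elif isrle:
            objs = _frString(objs)  # assumed: defined earlier in _mask.pyx
        else:
            raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])')
    else:
        raise Exception('unrecognized type. The following type: RLEs (rle), '
                        'np.ndarray (box), and list (box) are supported.')
    return objs
```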
isrle = np.all(np.array([type(obj) == dict for obj in objs])) # <<<<<<<<<<<<<< + * if isbox: + * objs = np.array(objs, dtype=np.double) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_all); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_array); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (likely(PyList_CheckExact(__pyx_v_objs)) || PyTuple_CheckExact(__pyx_v_objs)) { + __pyx_t_6 = __pyx_v_objs; __Pyx_INCREF(__pyx_t_6); __pyx_t_1 = 0; + __pyx_t_11 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_6 = PyObject_GetIter(__pyx_v_objs); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_11 = Py_TYPE(__pyx_t_6)->tp_iternext; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 177, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_11)) { + if (likely(PyList_CheckExact(__pyx_t_6))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_6)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_10 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_1); __Pyx_INCREF(__pyx_t_10); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 177, __pyx_L1_error) + #else + __pyx_t_10 = PySequence_ITEM(__pyx_t_6, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_6)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_10 = PyTuple_GET_ITEM(__pyx_t_6, __pyx_t_1); __Pyx_INCREF(__pyx_t_10); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 177, __pyx_L1_error) + #else + __pyx_t_10 = PySequence_ITEM(__pyx_t_6, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + #endif + } + } else { + __pyx_t_10 = __pyx_t_11(__pyx_t_6); + if (unlikely(!__pyx_t_10)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 177, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_10); + } + __Pyx_XDECREF_SET(__pyx_v_obj, __pyx_t_10); + __pyx_t_10 = 0; + __pyx_t_10 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)(&PyDict_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_10); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 177, __pyx_L1_error) + if (unlikely(__Pyx_ListComp_Append(__pyx_t_4, (PyObject*)__pyx_t_10))) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_6 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_6)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_7, function); + } + } + if (!__pyx_t_6) { + __pyx_t_5 = 
__Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_5); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_t_4}; + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_7)) { + PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_t_4}; + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_7, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_10 = PyTuple_New(1+1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_6); __pyx_t_6 = NULL; + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_10, 0+1, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_10, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + } + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_12))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_12); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_12); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_12, function); + } + } + if (!__pyx_t_7) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_12, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_12)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_5}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_12)) { + PyObject *__pyx_temp[2] = {__pyx_t_7, __pyx_t_5}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_12, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + { + __pyx_t_10 = PyTuple_New(1+1); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_7); __pyx_t_7 = NULL; + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_10, 0+1, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_10, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 177, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + } + } + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __pyx_v_isrle = __pyx_t_3; + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":178 + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or 
(type(obj)==np.ndarray)) for obj in objs])) + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: # <<<<<<<<<<<<<< + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + */ + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_isbox); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 178, __pyx_L1_error) + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":179 + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: + * objs = np.array(objs, dtype=np.double) # <<<<<<<<<<<<<< + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_array); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_v_objs); + __Pyx_GIVEREF(__pyx_v_objs); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_v_objs); + __pyx_t_10 = PyDict_New(); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_double); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (PyDict_SetItem(__pyx_t_10, __pyx_n_s_dtype, __pyx_t_7) < 0) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_12, __pyx_t_3, __pyx_t_10); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 179, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "pycocotools/_mask.pyx":180 + * if isbox: + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + */ + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 180, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_1 = PyObject_Length(__pyx_t_7); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 180, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_2 = ((__pyx_t_1 == 1) != 0); + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":181 + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) # <<<<<<<<<<<<<< + * elif isrle: + * objs = _frString(objs) + */ + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_reshape); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_objs, __pyx_n_s_shape); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_12 = __Pyx_GetItemInt(__pyx_t_3, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_12); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 181, __pyx_L1_error) + 
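Worth flagging in review: the two 1-D reshape paths quoted above are not symmetric. The list branch (pyx:181) produces the expected 1xN row, while the ndarray branch (pyx:169) passes `objs[0]` — the first array *element* — as a dimension, which looks like a latent quirk inherited from upstream pycocotools rather than an intentional change in this patch. A minimal illustration of the list-branch behaviour:

```python
import numpy as np

box = np.array([10.0, 20.0, 30.0, 40.0])     # one flat box
print(box.reshape((1, box.shape[0])).shape)  # (1, 4): a single Nx4 row, as _preproc expects
```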
__Pyx_GOTREF(__pyx_t_3); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_int_1); + __Pyx_GIVEREF(__pyx_t_12); + PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_12); + __pyx_t_12 = 0; + __pyx_t_12 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_10))) { + __pyx_t_12 = PyMethod_GET_SELF(__pyx_t_10); + if (likely(__pyx_t_12)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10); + __Pyx_INCREF(__pyx_t_12); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_10, function); + } + } + if (!__pyx_t_12) { + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_10, __pyx_t_3); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_3}; + __pyx_t_7 = __Pyx_PyFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_12, __pyx_t_3}; + __pyx_t_7 = __Pyx_PyCFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } else + #endif + { + __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_12); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_12); __pyx_t_12 = NULL; + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3); + __pyx_t_3 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_5, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 181, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "pycocotools/_mask.pyx":180 + * if isbox: + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: # <<<<<<<<<<<<<< + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + */ + } + + /* "pycocotools/_mask.pyx":178 + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + * isrle = np.all(np.array([type(obj) == dict for obj in objs])) + * if isbox: # <<<<<<<<<<<<<< + * objs = np.array(objs, dtype=np.double) + * if len(objs.shape) == 1: + */ + goto __pyx_L16; + } + + /* "pycocotools/_mask.pyx":182 + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: # <<<<<<<<<<<<<< + * objs = _frString(objs) + * else: + */ + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_v_isrle); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 182, __pyx_L1_error) + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":183 + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: + * objs = _frString(objs) # <<<<<<<<<<<<<< + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + */ + __pyx_t_10 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_5 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_10))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_10); + if 
(likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_10, function); + } + } + if (!__pyx_t_5) { + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_t_10, __pyx_v_objs); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_objs}; + __pyx_t_7 = __Pyx_PyFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_10)) { + PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_objs}; + __pyx_t_7 = __Pyx_PyCFunction_FastCall(__pyx_t_10, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_5); __pyx_t_5 = NULL; + __Pyx_INCREF(__pyx_v_objs); + __Pyx_GIVEREF(__pyx_v_objs); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_v_objs); + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_10, __pyx_t_3, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF_SET(__pyx_v_objs, __pyx_t_7); + __pyx_t_7 = 0; + + /* "pycocotools/_mask.pyx":182 + * if len(objs.shape) == 1: + * objs = objs.reshape((1,objs.shape[0])) + * elif isrle: # <<<<<<<<<<<<<< + * objs = _frString(objs) + * else: + */ + goto __pyx_L16; + } + + /* "pycocotools/_mask.pyx":185 + * objs = _frString(objs) + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') # <<<<<<<<<<<<<< + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + */ + /*else*/ { + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_Raise(__pyx_t_7, 0, 0, 0); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __PYX_ERR(0, 185, __pyx_L1_error) + } + __pyx_L16:; + + /* "pycocotools/_mask.pyx":174 + * raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + * objs = objs.astype(np.double) + * elif type(objs) == list: # <<<<<<<<<<<<<< + * # check if list is in box format and convert it to np.ndarray + * isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + */ + goto __pyx_L4; + } + + /* "pycocotools/_mask.pyx":187 + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + * else: + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') # <<<<<<<<<<<<<< + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + /*else*/ { + __pyx_t_7 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_Raise(__pyx_t_7, 0, 0, 0); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __PYX_ERR(0, 187, __pyx_L1_error) + } + __pyx_L4:; + + /* "pycocotools/_mask.pyx":188 + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs # <<<<<<<<<<<<<< + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":164 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_10); + __Pyx_XDECREF(__pyx_t_12); + __Pyx_XDECREF(__pyx_t_14); + __Pyx_AddTraceback("pycocotools._mask.iou._preproc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_isbox); + __Pyx_XDECREF(__pyx_v_isrle); + __Pyx_XDECREF(__pyx_v_obj); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":189 + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_3_rleIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_3iou_3_rleIou = {"_rleIou", (PyCFunction)__pyx_pw_11pycocotools_5_mask_3iou_3_rleIou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_3_rleIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_dt = 0; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_gt = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + PyArrayObject *__pyx_v__iou = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_rleIou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_iscrowd,&__pyx_n_s_m,&__pyx_n_s_n,&__pyx_n_s_iou,0}; + PyObject* values[6] = {0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 1); __PYX_ERR(0, 189, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_iscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 2); __PYX_ERR(0, 189, __pyx_L3_error) + } + case 3: + if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_m)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 3); __PYX_ERR(0, 189, __pyx_L3_error) + } + case 4: + if (likely((values[4] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 4); __PYX_ERR(0, 189, __pyx_L3_error) + } + case 5: + if (likely((values[5] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_iou)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, 5); __PYX_ERR(0, 189, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_rleIou") < 0)) __PYX_ERR(0, 189, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 6) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = 
PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + } + __pyx_v_dt = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)values[0]); + __pyx_v_gt = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)values[1]); + __pyx_v_iscrowd = ((PyArrayObject *)values[2]); + __pyx_v_m = __Pyx_PyInt_As_siz(values[3]); if (unlikely((__pyx_v_m == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 189, __pyx_L3_error) + __pyx_v_n = __Pyx_PyInt_As_siz(values[4]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 189, __pyx_L3_error) + __pyx_v__iou = ((PyArrayObject *)values[5]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("_rleIou", 1, 6, 6, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 189, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.iou._rleIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dt), __pyx_ptype_11pycocotools_5_mask_RLEs, 1, "dt", 0))) __PYX_ERR(0, 189, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_gt), __pyx_ptype_11pycocotools_5_mask_RLEs, 1, "gt", 0))) __PYX_ERR(0, 189, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_iscrowd), __pyx_ptype_5numpy_ndarray, 1, "iscrowd", 0))) __PYX_ERR(0, 189, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v__iou), __pyx_ptype_5numpy_ndarray, 1, "_iou", 0))) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_r = __pyx_pf_11pycocotools_5_mask_3iou_2_rleIou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_iscrowd, __pyx_v_m, __pyx_v_n, __pyx_v__iou); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_2_rleIou(CYTHON_UNUSED PyObject *__pyx_self, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_dt, struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou) { + __Pyx_LocalBuf_ND __pyx_pybuffernd__iou; + __Pyx_Buffer __pyx_pybuffer__iou; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_rleIou", 0); + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = &__pyx_pybuffer_iscrowd; + __pyx_pybuffer__iou.pybuffer.buf = NULL; + __pyx_pybuffer__iou.refcount = 0; + __pyx_pybuffernd__iou.data = NULL; + __pyx_pybuffernd__iou.rcbuffer = &__pyx_pybuffer__iou; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, (PyObject*)__pyx_v_iscrowd, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 189, __pyx_L1_error) + } + __pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd__iou.rcbuffer->pybuffer, (PyObject*)__pyx_v__iou, 
&__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 189, __pyx_L1_error) + } + __pyx_pybuffernd__iou.diminfo[0].strides = __pyx_pybuffernd__iou.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd__iou.diminfo[0].shape = __pyx_pybuffernd__iou.rcbuffer->pybuffer.shape[0]; + + /* "pycocotools/_mask.pyx":190 + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) # <<<<<<<<<<<<<< + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + */ + rleIou(((RLE *)__pyx_v_dt->_R), ((RLE *)__pyx_v_gt->_R), __pyx_v_m, __pyx_v_n, ((byte *)__pyx_v_iscrowd->data), ((double *)__pyx_v__iou->data)); + + /* "pycocotools/_mask.pyx":189 + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.iou._rleIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":191 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_5_bbIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_3iou_5_bbIou = {"_bbIou", (PyCFunction)__pyx_pw_11pycocotools_5_mask_3iou_5_bbIou, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_5_bbIou(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_dt = 0; + PyArrayObject *__pyx_v_gt = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + PyArrayObject *__pyx_v__iou = 0; + PyObject *__pyx_r = 0; + 
__Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_bbIou (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dt,&__pyx_n_s_gt,&__pyx_n_s_iscrowd,&__pyx_n_s_m,&__pyx_n_s_n,&__pyx_n_s_iou,0}; + PyObject* values[6] = {0,0,0,0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_dt)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_gt)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 1); __PYX_ERR(0, 191, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_iscrowd)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 2); __PYX_ERR(0, 191, __pyx_L3_error) + } + case 3: + if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_m)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 3); __PYX_ERR(0, 191, __pyx_L3_error) + } + case 4: + if (likely((values[4] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_n)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 4); __PYX_ERR(0, 191, __pyx_L3_error) + } + case 5: + if (likely((values[5] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_iou)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, 5); __PYX_ERR(0, 191, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_bbIou") < 0)) __PYX_ERR(0, 191, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 6) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + values[3] = PyTuple_GET_ITEM(__pyx_args, 3); + values[4] = PyTuple_GET_ITEM(__pyx_args, 4); + values[5] = PyTuple_GET_ITEM(__pyx_args, 5); + } + __pyx_v_dt = ((PyArrayObject *)values[0]); + __pyx_v_gt = ((PyArrayObject *)values[1]); + __pyx_v_iscrowd = ((PyArrayObject *)values[2]); + __pyx_v_m = __Pyx_PyInt_As_siz(values[3]); if (unlikely((__pyx_v_m == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 191, __pyx_L3_error) + __pyx_v_n = __Pyx_PyInt_As_siz(values[4]); if (unlikely((__pyx_v_n == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 191, __pyx_L3_error) + __pyx_v__iou = ((PyArrayObject *)values[5]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("_bbIou", 1, 6, 6, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 191, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.iou._bbIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dt), __pyx_ptype_5numpy_ndarray, 1, "dt", 0))) __PYX_ERR(0, 191, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject 
*)__pyx_v_gt), __pyx_ptype_5numpy_ndarray, 1, "gt", 0))) __PYX_ERR(0, 191, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_iscrowd), __pyx_ptype_5numpy_ndarray, 1, "iscrowd", 0))) __PYX_ERR(0, 191, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v__iou), __pyx_ptype_5numpy_ndarray, 1, "_iou", 0))) __PYX_ERR(0, 191, __pyx_L1_error) + __pyx_r = __pyx_pf_11pycocotools_5_mask_3iou_4_bbIou(__pyx_self, __pyx_v_dt, __pyx_v_gt, __pyx_v_iscrowd, __pyx_v_m, __pyx_v_n, __pyx_v__iou); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_4_bbIou(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dt, PyArrayObject *__pyx_v_gt, PyArrayObject *__pyx_v_iscrowd, siz __pyx_v_m, siz __pyx_v_n, PyArrayObject *__pyx_v__iou) { + __Pyx_LocalBuf_ND __pyx_pybuffernd__iou; + __Pyx_Buffer __pyx_pybuffer__iou; + __Pyx_LocalBuf_ND __pyx_pybuffernd_dt; + __Pyx_Buffer __pyx_pybuffer_dt; + __Pyx_LocalBuf_ND __pyx_pybuffernd_gt; + __Pyx_Buffer __pyx_pybuffer_gt; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_bbIou", 0); + __pyx_pybuffer_dt.pybuffer.buf = NULL; + __pyx_pybuffer_dt.refcount = 0; + __pyx_pybuffernd_dt.data = NULL; + __pyx_pybuffernd_dt.rcbuffer = &__pyx_pybuffer_dt; + __pyx_pybuffer_gt.pybuffer.buf = NULL; + __pyx_pybuffer_gt.refcount = 0; + __pyx_pybuffernd_gt.data = NULL; + __pyx_pybuffernd_gt.rcbuffer = &__pyx_pybuffer_gt; + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = &__pyx_pybuffer_iscrowd; + __pyx_pybuffer__iou.pybuffer.buf = NULL; + __pyx_pybuffer__iou.refcount = 0; + __pyx_pybuffernd__iou.data = NULL; + __pyx_pybuffernd__iou.rcbuffer = &__pyx_pybuffer__iou; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_dt.rcbuffer->pybuffer, (PyObject*)__pyx_v_dt, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 191, __pyx_L1_error) + } + __pyx_pybuffernd_dt.diminfo[0].strides = __pyx_pybuffernd_dt.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_dt.diminfo[0].shape = __pyx_pybuffernd_dt.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_dt.diminfo[1].strides = __pyx_pybuffernd_dt.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_dt.diminfo[1].shape = __pyx_pybuffernd_dt.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_gt.rcbuffer->pybuffer, (PyObject*)__pyx_v_gt, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 191, __pyx_L1_error) + } + __pyx_pybuffernd_gt.diminfo[0].strides = __pyx_pybuffernd_gt.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_gt.diminfo[0].shape = __pyx_pybuffernd_gt.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_gt.diminfo[1].strides = __pyx_pybuffernd_gt.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_gt.diminfo[1].shape = __pyx_pybuffernd_gt.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, (PyObject*)__pyx_v_iscrowd, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 
1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 191, __pyx_L1_error) + } + __pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd__iou.rcbuffer->pybuffer, (PyObject*)__pyx_v__iou, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) __PYX_ERR(0, 191, __pyx_L1_error) + } + __pyx_pybuffernd__iou.diminfo[0].strides = __pyx_pybuffernd__iou.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd__iou.diminfo[0].shape = __pyx_pybuffernd__iou.rcbuffer->pybuffer.shape[0]; + + /* "pycocotools/_mask.pyx":192 + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) # <<<<<<<<<<<<<< + * def _len(obj): + * cdef siz N = 0 + */ + bbIou(((BB)__pyx_v_dt->data), ((BB)__pyx_v_gt->data), __pyx_v_m, __pyx_v_n, ((byte *)__pyx_v_iscrowd->data), ((double *)__pyx_v__iou->data)); + + /* "pycocotools/_mask.pyx":191 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_gt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.iou._bbIou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd__iou.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_gt.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":193 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_7_len(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_3iou_7_len = {"_len", (PyCFunction)__pyx_pw_11pycocotools_5_mask_3iou_7_len, 
METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_3iou_7_len(PyObject *__pyx_self, PyObject *__pyx_v_obj) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("_len (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_3iou_6_len(__pyx_self, ((PyObject *)__pyx_v_obj)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_3iou_6_len(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) { + siz __pyx_v_N; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + siz __pyx_t_3; + Py_ssize_t __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + __Pyx_RefNannySetupContext("_len", 0); + + /* "pycocotools/_mask.pyx":194 + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + * cdef siz N = 0 # <<<<<<<<<<<<<< + * if type(obj) == RLEs: + * N = obj.n + */ + __pyx_v_N = 0; + + /* "pycocotools/_mask.pyx":195 + * def _len(obj): + * cdef siz N = 0 + * if type(obj) == RLEs: # <<<<<<<<<<<<<< + * N = obj.n + * elif len(obj)==0: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":196 + * cdef siz N = 0 + * if type(obj) == RLEs: + * N = obj.n # <<<<<<<<<<<<<< + * elif len(obj)==0: + * pass + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 196, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_3 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_N = __pyx_t_3; + + /* "pycocotools/_mask.pyx":195 + * def _len(obj): + * cdef siz N = 0 + * if type(obj) == RLEs: # <<<<<<<<<<<<<< + * N = obj.n + * elif len(obj)==0: + */ + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":197 + * if type(obj) == RLEs: + * N = obj.n + * elif len(obj)==0: # <<<<<<<<<<<<<< + * pass + * elif type(obj) == np.ndarray: + */ + __pyx_t_4 = PyObject_Length(__pyx_v_obj); if (unlikely(__pyx_t_4 == -1)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_2 = ((__pyx_t_4 == 0) != 0); + if (__pyx_t_2) { + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":199 + * elif len(obj)==0: + * pass + * elif type(obj) == np.ndarray: # <<<<<<<<<<<<<< + * N = obj.shape[0] + * return N + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_obj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 199, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 199, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":200 + * pass + * elif type(obj) == np.ndarray: + * N = obj.shape[0] # <<<<<<<<<<<<<< + * return N + * # convert iscrowd to numpy array + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_shape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_GetItemInt(__pyx_t_1, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 200, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_3 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 200, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_N = __pyx_t_3; + + /* "pycocotools/_mask.pyx":199 + * elif len(obj)==0: + * pass + * elif type(obj) == np.ndarray: # <<<<<<<<<<<<<< + * N = obj.shape[0] + * return N + */ + } + __pyx_L3:; + + /* "pycocotools/_mask.pyx":201 + * elif type(obj) == np.ndarray: + * N = obj.shape[0] + * return N # <<<<<<<<<<<<<< + * # convert iscrowd to numpy array + * cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_5 = __Pyx_PyInt_From_siz(__pyx_v_N); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 201, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":193 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_AddTraceback("pycocotools._mask.iou._len", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":163 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + +static PyObject *__pyx_pf_11pycocotools_5_mask_12iou(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_dt, PyObject *__pyx_v_gt, PyObject *__pyx_v_pyiscrowd) { + PyObject *__pyx_v__preproc = 0; + PyObject *__pyx_v__rleIou = 0; + PyObject *__pyx_v__bbIou = 0; + PyObject *__pyx_v__len = 0; + PyArrayObject *__pyx_v_iscrowd = 0; + siz __pyx_v_m; + siz __pyx_v_n; + double *__pyx_v__iou; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v__iouFun = NULL; + PyObject *__pyx_v_iou = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_iscrowd; + __Pyx_Buffer __pyx_pybuffer_iscrowd; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyArrayObject *__pyx_t_6 = NULL; + siz __pyx_t_7; + int __pyx_t_8; + int __pyx_t_9; + int __pyx_t_10; + PyObject *__pyx_t_11 = NULL; + __Pyx_RefNannySetupContext("iou", 0); + __Pyx_INCREF(__pyx_v_dt); + __Pyx_INCREF(__pyx_v_gt); + __pyx_pybuffer_iscrowd.pybuffer.buf = NULL; + __pyx_pybuffer_iscrowd.refcount = 0; + __pyx_pybuffernd_iscrowd.data = NULL; + __pyx_pybuffernd_iscrowd.rcbuffer = &__pyx_pybuffer_iscrowd; + + /* "pycocotools/_mask.pyx":164 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). 
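The three inner helpers whose wrappers and bodies appear above are thin: `_rleIou` and `_bbIou` simply forward raw buffers to the C kernels `rleIou`/`bbIou` (visible in the generated code as the `RLE*`, `BB`, `byte*`, and `double*` casts), and `_len` counts objects. Their pyx source, transcribed from the quoted comments (`RLEs` is the cdef wrapper class and `siz` the C size typedef defined earlier in the file):

```python
def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd,
            siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
    rleIou(dt._R, gt._R, m, n, iscrowd.data, _iou.data)

def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt,
           np.ndarray[np.uint8_t, ndim=1] iscrowd,
           siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
    bbIou(dt.data, gt.data, m, n, iscrowd.data, _iou.data)

def _len(obj):
    cdef siz N = 0
    if type(obj) == RLEs:
        N = obj.n          # RLEs exposes the count of encoded masks
    elif len(obj) == 0:
        pass               # empty input: N stays 0
    elif type(obj) == np.ndarray:
        N = obj.shape[0]   # Nx4 box array
    return N
```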
+ * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_3iou_1_preproc, 0, __pyx_n_s_iou_locals__preproc, NULL, __pyx_n_s_pycocotools__mask, __pyx_d, ((PyObject *)__pyx_codeobj__6)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__preproc = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":189 + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_3iou_3_rleIou, 0, __pyx_n_s_iou_locals__rleIou, NULL, __pyx_n_s_pycocotools__mask, __pyx_d, ((PyObject *)__pyx_codeobj__8)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__rleIou = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":191 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_3iou_5_bbIou, 0, __pyx_n_s_iou_locals__bbIou, NULL, __pyx_n_s_pycocotools__mask, __pyx_d, ((PyObject *)__pyx_codeobj__10)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__bbIou = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":193 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_3iou_7_len, 0, __pyx_n_s_iou_locals__len, NULL, __pyx_n_s_pycocotools__mask, __pyx_d, ((PyObject *)__pyx_codeobj__12)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_v__len = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":203 + * return N + * # convert iscrowd to numpy array + * cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) # <<<<<<<<<<<<<< + * # simple type checking + * cdef siz m, n + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + 
__Pyx_INCREF(__pyx_v_pyiscrowd); + __Pyx_GIVEREF(__pyx_v_pyiscrowd); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_pyiscrowd); + __pyx_t_3 = PyDict_New(); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_uint8); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_5) < 0) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 203, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 203, __pyx_L1_error) + __pyx_t_6 = ((PyArrayObject *)__pyx_t_5); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint8_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + __pyx_v_iscrowd = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.buf = NULL; + __PYX_ERR(0, 203, __pyx_L1_error) + } else {__pyx_pybuffernd_iscrowd.diminfo[0].strides = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_iscrowd.diminfo[0].shape = __pyx_pybuffernd_iscrowd.rcbuffer->pybuffer.shape[0]; + } + } + __pyx_t_6 = 0; + __pyx_v_iscrowd = ((PyArrayObject *)__pyx_t_5); + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":206 + * # simple type checking + * cdef siz m, n + * dt = _preproc(dt) # <<<<<<<<<<<<<< + * gt = _preproc(gt) + * m = _len(dt) + */ + __pyx_t_5 = __pyx_pf_11pycocotools_5_mask_3iou__preproc(__pyx_v__preproc, __pyx_v_dt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 206, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_dt, __pyx_t_5); + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":207 + * cdef siz m, n + * dt = _preproc(dt) + * gt = _preproc(gt) # <<<<<<<<<<<<<< + * m = _len(dt) + * n = _len(gt) + */ + __pyx_t_5 = __pyx_pf_11pycocotools_5_mask_3iou__preproc(__pyx_v__preproc, __pyx_v_gt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 207, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF_SET(__pyx_v_gt, __pyx_t_5); + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":208 + * dt = _preproc(dt) + * gt = _preproc(gt) + * m = _len(dt) # <<<<<<<<<<<<<< + * n = _len(gt) + * if m == 0 or n == 0: + */ + __pyx_t_5 = __pyx_pf_11pycocotools_5_mask_3iou_6_len(__pyx_v__len, __pyx_v_dt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 208, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_7 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 208, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_m = __pyx_t_7; + + /* "pycocotools/_mask.pyx":209 + * gt = _preproc(gt) + * m = _len(dt) + * n = _len(gt) # <<<<<<<<<<<<<< + * if m == 0 or n == 0: + * return [] + */ + __pyx_t_5 = __pyx_pf_11pycocotools_5_mask_3iou_6_len(__pyx_v__len, __pyx_v_gt); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 209, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + 
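+
+/* [editor's note] The block above is the input-normalization phase of
+ * iou(): the nested _preproc() closure coerces dt/gt into either an
+ * RLEs object or an n x 4 double ndarray, and _len() counts the
+ * elements of each before any C work is done. A minimal usage sketch
+ * through the public pycocotools.mask wrapper (hypothetical data;
+ * assumes the standard pycocotools Python layout):
+ *
+ *     from pycocotools import mask as maskUtils
+ *     dt = [[10., 10., 20., 20.]]        # detections, [x, y, w, h]
+ *     gt = [[12., 12., 18., 18.]]        # ground-truth boxes
+ *     ious = maskUtils.iou(dt, gt, [0])  # one iscrowd flag per gt box
+ */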
__pyx_t_7 = __Pyx_PyInt_As_siz(__pyx_t_5); if (unlikely((__pyx_t_7 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 209, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_v_n = __pyx_t_7; + + /* "pycocotools/_mask.pyx":210 + * m = _len(dt) + * n = _len(gt) + * if m == 0 or n == 0: # <<<<<<<<<<<<<< + * return [] + * if not type(dt) == type(gt): + */ + __pyx_t_9 = ((__pyx_v_m == 0) != 0); + if (!__pyx_t_9) { + } else { + __pyx_t_8 = __pyx_t_9; + goto __pyx_L4_bool_binop_done; + } + __pyx_t_9 = ((__pyx_v_n == 0) != 0); + __pyx_t_8 = __pyx_t_9; + __pyx_L4_bool_binop_done:; + if (__pyx_t_8) { + + /* "pycocotools/_mask.pyx":211 + * n = _len(gt) + * if m == 0 or n == 0: + * return [] # <<<<<<<<<<<<<< + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_5 = PyList_New(0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 211, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_r = __pyx_t_5; + __pyx_t_5 = 0; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":210 + * m = _len(dt) + * n = _len(gt) + * if m == 0 or n == 0: # <<<<<<<<<<<<<< + * return [] + * if not type(dt) == type(gt): + */ + } + + /* "pycocotools/_mask.pyx":212 + * if m == 0 or n == 0: + * return [] + * if not type(dt) == type(gt): # <<<<<<<<<<<<<< + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + * + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)Py_TYPE(__pyx_v_gt)), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 212, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_9 = ((!__pyx_t_8) != 0); + if (__pyx_t_9) { + + /* "pycocotools/_mask.pyx":213 + * return [] + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') # <<<<<<<<<<<<<< + * + * # define local variables + */ + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 213, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_Raise(__pyx_t_5, 0, 0, 0); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __PYX_ERR(0, 213, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":212 + * if m == 0 or n == 0: + * return [] + * if not type(dt) == type(gt): # <<<<<<<<<<<<<< + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + * + */ + } + + /* "pycocotools/_mask.pyx":216 + * + * # define local variables + * cdef double* _iou = 0 # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * # check type and assign iou function + */ + __pyx_v__iou = ((double *)0); + + /* "pycocotools/_mask.pyx":219 + * cdef np.npy_intp shape[1] + * # check type and assign iou function + * if type(dt) == RLEs: # <<<<<<<<<<<<<< + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 219, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 219, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_9) { + + /* "pycocotools/_mask.pyx":220 + * # check type and assign iou function + * if 
type(dt) == RLEs: + * _iouFun = _rleIou # <<<<<<<<<<<<<< + * elif type(dt) == np.ndarray: + * _iouFun = _bbIou + */ + __Pyx_INCREF(__pyx_v__rleIou); + __pyx_v__iouFun = __pyx_v__rleIou; + + /* "pycocotools/_mask.pyx":219 + * cdef np.npy_intp shape[1] + * # check type and assign iou function + * if type(dt) == RLEs: # <<<<<<<<<<<<<< + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: + */ + goto __pyx_L7; + } + + /* "pycocotools/_mask.pyx":221 + * if type(dt) == RLEs: + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: # <<<<<<<<<<<<<< + * _iouFun = _bbIou + * else: + */ + __pyx_t_5 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_dt)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_5); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 221, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 221, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (__pyx_t_9) { + + /* "pycocotools/_mask.pyx":222 + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: + * _iouFun = _bbIou # <<<<<<<<<<<<<< + * else: + * raise Exception('input data type not allowed.') + */ + __Pyx_INCREF(__pyx_v__bbIou); + __pyx_v__iouFun = __pyx_v__bbIou; + + /* "pycocotools/_mask.pyx":221 + * if type(dt) == RLEs: + * _iouFun = _rleIou + * elif type(dt) == np.ndarray: # <<<<<<<<<<<<<< + * _iouFun = _bbIou + * else: + */ + goto __pyx_L7; + } + + /* "pycocotools/_mask.pyx":224 + * _iouFun = _bbIou + * else: + * raise Exception('input data type not allowed.') # <<<<<<<<<<<<<< + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) + */ + /*else*/ { + __pyx_t_5 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_Raise(__pyx_t_5, 0, 0, 0); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __PYX_ERR(0, 224, __pyx_L1_error) + } + __pyx_L7:; + + /* "pycocotools/_mask.pyx":225 + * else: + * raise Exception('input data type not allowed.') + * _iou = malloc(m*n* sizeof(double)) # <<<<<<<<<<<<<< + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n + */ + __pyx_v__iou = ((double *)malloc(((__pyx_v_m * __pyx_v_n) * (sizeof(double))))); + + /* "pycocotools/_mask.pyx":226 + * raise Exception('input data type not allowed.') + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) # <<<<<<<<<<<<<< + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_zeros); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_PyInt_From_siz((__pyx_v_m * __pyx_v_n)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 226, 
__pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_double); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_4) < 0) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, __pyx_t_1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 226, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_iou = __pyx_t_4; + __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":227 + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n # <<<<<<<<<<<<<< + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + */ + (__pyx_v_shape[0]) = (((npy_intp)__pyx_v_m) * __pyx_v_n); + + /* "pycocotools/_mask.pyx":228 + * iou = np.zeros((m*n, ), dtype=np.double) + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) + */ + __pyx_t_4 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_DOUBLE, __pyx_v__iou); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 228, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF_SET(__pyx_v_iou, __pyx_t_4); + __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":229 + * shape[0] = m*n + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * _iouFun(dt, gt, iscrowd, m, n, iou) + * return iou.reshape((m,n), order='F') + */ + if (!(likely(((__pyx_v_iou) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_iou, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 229, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_iou), NPY_OWNDATA); + + /* "pycocotools/_mask.pyx":230 + * iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) # <<<<<<<<<<<<<< + * return iou.reshape((m,n), order='F') + * + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_m); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_INCREF(__pyx_v__iouFun); + __pyx_t_3 = __pyx_v__iouFun; __pyx_t_2 = NULL; + __pyx_t_10 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_10 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[7] = {__pyx_t_2, __pyx_v_dt, __pyx_v_gt, ((PyObject *)__pyx_v_iscrowd), __pyx_t_1, __pyx_t_5, __pyx_v_iou}; + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_10, 6+__pyx_t_10); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_4); + 
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[7] = {__pyx_t_2, __pyx_v_dt, __pyx_v_gt, ((PyObject *)__pyx_v_iscrowd), __pyx_t_1, __pyx_t_5, __pyx_v_iou}; + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_10, 6+__pyx_t_10); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + { + __pyx_t_11 = PyTuple_New(6+__pyx_t_10); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + if (__pyx_t_2) { + __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_2); __pyx_t_2 = NULL; + } + __Pyx_INCREF(__pyx_v_dt); + __Pyx_GIVEREF(__pyx_v_dt); + PyTuple_SET_ITEM(__pyx_t_11, 0+__pyx_t_10, __pyx_v_dt); + __Pyx_INCREF(__pyx_v_gt); + __Pyx_GIVEREF(__pyx_v_gt); + PyTuple_SET_ITEM(__pyx_t_11, 1+__pyx_t_10, __pyx_v_gt); + __Pyx_INCREF(((PyObject *)__pyx_v_iscrowd)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_iscrowd)); + PyTuple_SET_ITEM(__pyx_t_11, 2+__pyx_t_10, ((PyObject *)__pyx_v_iscrowd)); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_11, 3+__pyx_t_10, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_11, 4+__pyx_t_10, __pyx_t_5); + __Pyx_INCREF(__pyx_v_iou); + __Pyx_GIVEREF(__pyx_v_iou); + PyTuple_SET_ITEM(__pyx_t_11, 5+__pyx_t_10, __pyx_v_iou); + __pyx_t_1 = 0; + __pyx_t_5 = 0; + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_11, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 230, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + + /* "pycocotools/_mask.pyx":231 + * PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + * _iouFun(dt, gt, iscrowd, m, n, iou) + * return iou.reshape((m,n), order='F') # <<<<<<<<<<<<<< + * + * def toBbox( rleObjs ): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_iou, __pyx_n_s_reshape); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_PyInt_From_siz(__pyx_v_m); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_11 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_3); + __Pyx_GIVEREF(__pyx_t_11); + PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_11); + __pyx_t_3 = 0; + __pyx_t_11 = 0; + __pyx_t_11 = PyTuple_New(1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyDict_New(); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (PyDict_SetItem(__pyx_t_5, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 231, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_11, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 231, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __Pyx_DECREF(__pyx_t_5); 
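+
+/* [editor's note] Memory-ownership pattern compiled above: the raw
+ * malloc() buffer _iou is wrapped with PyArray_SimpleNewFromData() and
+ * then flagged NPY_OWNDATA so NumPy's deallocator frees it; the
+ * np.zeros() array created just before is discarded and only acts as a
+ * placeholder in the .pyx source. The corresponding Cython lines, as
+ * quoted from _mask.pyx above (the <double*> cast is in the original
+ * source):
+ *
+ *     _iou = <double*> malloc(m*n* sizeof(double))
+ *     shape[0] = m*n
+ *     iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou)
+ *     PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA)  # NumPy now owns _iou
+ */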
__pyx_t_5 = 0; + __pyx_r = __pyx_t_3; + __pyx_t_3 = 0; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":163 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_11); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.iou", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_iscrowd.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF(__pyx_v__preproc); + __Pyx_XDECREF(__pyx_v__rleIou); + __Pyx_XDECREF(__pyx_v__bbIou); + __Pyx_XDECREF(__pyx_v__len); + __Pyx_XDECREF((PyObject *)__pyx_v_iscrowd); + __Pyx_XDECREF(__pyx_v__iouFun); + __Pyx_XDECREF(__pyx_v_iou); + __Pyx_XDECREF(__pyx_v_dt); + __Pyx_XDECREF(__pyx_v_gt); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":233 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_15toBbox(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_15toBbox = {"toBbox", (PyCFunction)__pyx_pw_11pycocotools_5_mask_15toBbox, METH_O, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_15toBbox(PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("toBbox (wrapper)", 0); + __pyx_r = __pyx_pf_11pycocotools_5_mask_14toBbox(__pyx_self, ((PyObject *)__pyx_v_rleObjs)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_14toBbox(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_rleObjs) { + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = 0; + siz __pyx_v_n; + BB __pyx_v__bb; + npy_intp __pyx_v_shape[1]; + PyObject *__pyx_v_bb = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + siz __pyx_t_5; + PyObject *__pyx_t_6 = NULL; + __Pyx_RefNannySetupContext("toBbox", 0); + + /* "pycocotools/_mask.pyx":234 + * + * def toBbox( rleObjs ): + * cdef RLEs Rs = _frString(rleObjs) # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_frString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, 
__pyx_v_rleObjs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_rleObjs}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL; + __Pyx_INCREF(__pyx_v_rleObjs); + __Pyx_GIVEREF(__pyx_v_rleObjs); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_rleObjs); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 234, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_11pycocotools_5_mask_RLEs))))) __PYX_ERR(0, 234, __pyx_L1_error) + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":235 + * def toBbox( rleObjs ): + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n # <<<<<<<<<<<<<< + * cdef BB _bb = malloc(4*n* sizeof(double)) + * rleToBbox( Rs._R, _bb, n ) + */ + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_Rs), __pyx_n_s_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 235, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = __Pyx_PyInt_As_siz(__pyx_t_1); if (unlikely((__pyx_t_5 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 235, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_n = __pyx_t_5; + + /* "pycocotools/_mask.pyx":236 + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) # <<<<<<<<<<<<<< + * rleToBbox( Rs._R, _bb, n ) + * cdef np.npy_intp shape[1] + */ + __pyx_v__bb = ((BB)malloc(((4 * __pyx_v_n) * (sizeof(double))))); + + /* "pycocotools/_mask.pyx":237 + * cdef siz n = Rs.n + * cdef BB _bb = malloc(4*n* sizeof(double)) + * rleToBbox( Rs._R, _bb, n ) # <<<<<<<<<<<<<< + * cdef np.npy_intp shape[1] + * shape[0] = 4*n + */ + rleToBbox(((RLE const *)__pyx_v_Rs->_R), __pyx_v__bb, __pyx_v_n); + + /* "pycocotools/_mask.pyx":239 + * rleToBbox( Rs._R, _bb, n ) + * cdef np.npy_intp shape[1] + * shape[0] = 4*n # <<<<<<<<<<<<<< + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + */ + (__pyx_v_shape[0]) = (((npy_intp)4) * __pyx_v_n); + + /* "pycocotools/_mask.pyx":240 + * cdef np.npy_intp shape[1] + * shape[0] = 4*n + * bb = np.array((1,4*n), dtype=np.double) # <<<<<<<<<<<<<< + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = 
__Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_array); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_siz((4 * __pyx_v_n)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_INCREF(__pyx_int_1); + __Pyx_GIVEREF(__pyx_int_1); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_int_1); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = PyDict_New(); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_double); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_6) < 0) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 240, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_v_bb = __pyx_t_6; + __pyx_t_6 = 0; + + /* "pycocotools/_mask.pyx":241 + * shape[0] = 4*n + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) # <<<<<<<<<<<<<< + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + * return bb + */ + __pyx_t_4 = PyArray_SimpleNewFromData(1, __pyx_v_shape, NPY_DOUBLE, __pyx_v__bb); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_reshape); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_4); + __Pyx_INCREF(__pyx_int_4); + __Pyx_GIVEREF(__pyx_int_4); + PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_int_4); + __pyx_t_4 = 0; + __pyx_t_4 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + } + } + if (!__pyx_t_4) { + __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_6); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; + __pyx_t_6 = __Pyx_PyFunction_FastCall(__pyx_t_1, 
__pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_2}; + __pyx_t_6 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + } else + #endif + { + __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL; + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_t_2); + __pyx_t_2 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_3, NULL); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 241, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + } + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF_SET(__pyx_v_bb, __pyx_t_6); + __pyx_t_6 = 0; + + /* "pycocotools/_mask.pyx":242 + * bb = np.array((1,4*n), dtype=np.double) + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) # <<<<<<<<<<<<<< + * return bb + * + */ + if (!(likely(((__pyx_v_bb) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_bb, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 242, __pyx_L1_error) + PyArray_ENABLEFLAGS(((PyArrayObject *)__pyx_v_bb), NPY_OWNDATA); + + /* "pycocotools/_mask.pyx":243 + * bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + * PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + * return bb # <<<<<<<<<<<<<< + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_bb); + __pyx_r = __pyx_v_bb; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":233 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("pycocotools._mask.toBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_bb); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":245 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_17frBbox(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_17frBbox = {"frBbox", (PyCFunction)__pyx_pw_11pycocotools_5_mask_17frBbox, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_17frBbox(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_bb = 0; + siz __pyx_v_h; + siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frBbox (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = 
{&__pyx_n_s_bb,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_bb)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, 1); __PYX_ERR(0, 245, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, 2); __PYX_ERR(0, 245, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frBbox") < 0)) __PYX_ERR(0, 245, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_bb = ((PyArrayObject *)values[0]); + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 245, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 245, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frBbox", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 245, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.frBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_bb), __pyx_ptype_5numpy_ndarray, 1, "bb", 0))) __PYX_ERR(0, 245, __pyx_L1_error) + __pyx_r = __pyx_pf_11pycocotools_5_mask_16frBbox(__pyx_self, __pyx_v_bb, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_16frBbox(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_bb, siz __pyx_v_h, siz __pyx_v_w) { + siz __pyx_v_n; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_bb; + __Pyx_Buffer __pyx_pybuffer_bb; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + __Pyx_RefNannySetupContext("frBbox", 0); + __pyx_pybuffer_bb.pybuffer.buf = NULL; + __pyx_pybuffer_bb.refcount = 0; + __pyx_pybuffernd_bb.data = NULL; + __pyx_pybuffernd_bb.rcbuffer = &__pyx_pybuffer_bb; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_bb.rcbuffer->pybuffer, (PyObject*)__pyx_v_bb, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 245, __pyx_L1_error) + } + 
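+
+/* [editor's note] frBbox(bb, h, w), entered above, takes an n x 4
+ * double array of [x, y, w, h] boxes, rasterizes each onto an h x w
+ * canvas with rleFrBbox(), and returns compressed RLE dicts via
+ * _toString(). Hypothetical usage through the public dispatcher
+ * (frPyObjects routes ndarray input to frBbox):
+ *
+ *     import numpy as np
+ *     from pycocotools import mask as maskUtils
+ *     boxes = np.array([[10., 10., 20., 20.]], dtype=np.double)
+ *     rles = maskUtils.frPyObjects(boxes, 480, 640)  # h=480, w=640
+ */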
__pyx_pybuffernd_bb.diminfo[0].strides = __pyx_pybuffernd_bb.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_bb.diminfo[0].shape = __pyx_pybuffernd_bb.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_bb.diminfo[1].strides = __pyx_pybuffernd_bb.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_bb.diminfo[1].shape = __pyx_pybuffernd_bb.rcbuffer->pybuffer.shape[1]; + + /* "pycocotools/_mask.pyx":246 + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + * cdef siz n = bb.shape[0] # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) + */ + __pyx_v_n = (__pyx_v_bb->dimensions[0]); + + /* "pycocotools/_mask.pyx":247 + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) + */ + __pyx_t_1 = __Pyx_PyInt_From_siz(__pyx_v_n); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 247, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 247, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 247, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":248 + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) # <<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + rleFrBbox(((RLE *)__pyx_v_Rs->_R), ((BB const )__pyx_v_bb->data), __pyx_v_h, __pyx_v_w, __pyx_v_n); + + /* "pycocotools/_mask.pyx":249 + * Rs = RLEs(n) + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2); + if (likely(__pyx_t_3)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_2, function); + } + } + if (!__pyx_t_3) { + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[2] = {__pyx_t_3, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_GOTREF(__pyx_t_1); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = 
NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 249, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":250 + * rleFrBbox( Rs._R, bb.data, h, w, n ) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * def frPoly( poly, siz h, siz w ): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":245 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_bb.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.frBbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_bb.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":252 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_19frPoly(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_19frPoly = {"frPoly", (PyCFunction)__pyx_pw_11pycocotools_5_mask_19frPoly, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_19frPoly(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_poly = 0; + siz __pyx_v_h; + siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frPoly (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_poly,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_poly)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, 1); __PYX_ERR(0, 252, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, 
2); __PYX_ERR(0, 252, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frPoly") < 0)) __PYX_ERR(0, 252, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_poly = values[0]; + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 252, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 252, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frPoly", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 252, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.frPoly", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_18frPoly(__pyx_self, __pyx_v_poly, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_18frPoly(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_poly, siz __pyx_v_h, siz __pyx_v_w) { + PyArrayObject *__pyx_v_np_poly = 0; + Py_ssize_t __pyx_v_n; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = NULL; + PyObject *__pyx_v_i = NULL; + PyObject *__pyx_v_p = NULL; + PyObject *__pyx_v_objs = NULL; + __Pyx_LocalBuf_ND __pyx_pybuffernd_np_poly; + __Pyx_Buffer __pyx_pybuffer_np_poly; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *(*__pyx_t_4)(PyObject *); + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + PyObject *__pyx_t_9 = NULL; + PyArrayObject *__pyx_t_10 = NULL; + int __pyx_t_11; + PyObject *__pyx_t_12 = NULL; + PyObject *__pyx_t_13 = NULL; + PyObject *__pyx_t_14 = NULL; + Py_ssize_t __pyx_t_15; + Py_ssize_t __pyx_t_16; + __Pyx_RefNannySetupContext("frPoly", 0); + __pyx_pybuffer_np_poly.pybuffer.buf = NULL; + __pyx_pybuffer_np_poly.refcount = 0; + __pyx_pybuffernd_np_poly.data = NULL; + __pyx_pybuffernd_np_poly.rcbuffer = &__pyx_pybuffer_np_poly; + + /* "pycocotools/_mask.pyx":254 + * def frPoly( poly, siz h, siz w ): + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) # <<<<<<<<<<<<<< + * Rs = RLEs(n) + * for i, p in enumerate(poly): + */ + __pyx_t_1 = PyObject_Length(__pyx_v_poly); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 254, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "pycocotools/_mask.pyx":255 + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + * Rs = RLEs(n) # <<<<<<<<<<<<<< + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') + */ + __pyx_t_2 = PyInt_FromSsize_t(__pyx_v_n); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 255, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 255, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); + __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), 
__pyx_t_3, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 255, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_Rs = ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_2); + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":256 + * n = len(poly) + * Rs = RLEs(n) + * for i, p in enumerate(poly): # <<<<<<<<<<<<<< + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + */ + __Pyx_INCREF(__pyx_int_0); + __pyx_t_2 = __pyx_int_0; + if (likely(PyList_CheckExact(__pyx_v_poly)) || PyTuple_CheckExact(__pyx_v_poly)) { + __pyx_t_3 = __pyx_v_poly; __Pyx_INCREF(__pyx_t_3); __pyx_t_1 = 0; + __pyx_t_4 = NULL; + } else { + __pyx_t_1 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_poly); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 256, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 256, __pyx_L1_error) + } + for (;;) { + if (likely(!__pyx_t_4)) { + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_1 >= PyList_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 256, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 256, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } else { + if (__pyx_t_1 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_1); __Pyx_INCREF(__pyx_t_5); __pyx_t_1++; if (unlikely(0 < 0)) __PYX_ERR(0, 256, __pyx_L1_error) + #else + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_1); __pyx_t_1++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 256, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + #endif + } + } else { + __pyx_t_5 = __pyx_t_4(__pyx_t_3); + if (unlikely(!__pyx_t_5)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else __PYX_ERR(0, 256, __pyx_L1_error) + } + break; + } + __Pyx_GOTREF(__pyx_t_5); + } + __Pyx_XDECREF_SET(__pyx_v_p, __pyx_t_5); + __pyx_t_5 = 0; + __Pyx_INCREF(__pyx_t_2); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_2); + __pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_t_2, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 256, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); + __pyx_t_2 = __pyx_t_5; + __pyx_t_5 = 0; + + /* "pycocotools/_mask.pyx":257 + * Rs = RLEs(n) + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') # <<<<<<<<<<<<<< + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + * objs = _toString(Rs) + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_array); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_INCREF(__pyx_v_p); + __Pyx_GIVEREF(__pyx_v_p); + PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_v_p); + __pyx_t_7 = PyDict_New(); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 257, __pyx_L1_error) + 
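+
+/* [editor's note] In the frPoly() loop here, each polygon p is a flat
+ * [x1, y1, x2, y2, ...] list of vertices; it is copied into a
+ * Fortran-ordered double array and rleFrPoly() receives len(np_poly)/2
+ * as the vertex count. Hypothetical usage through the public
+ * dispatcher (frPyObjects routes lists of flat polygons to frPoly):
+ *
+ *     from pycocotools import mask as maskUtils
+ *     poly = [[10., 10., 60., 10., 60., 40.]]  # one triangle
+ *     rles = maskUtils.frPyObjects(poly, 480, 640)
+ */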
__Pyx_GOTREF(__pyx_t_7); + __pyx_t_8 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_8); + __pyx_t_9 = __Pyx_PyObject_GetAttrStr(__pyx_t_8, __pyx_n_s_double); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + if (PyDict_SetItem(__pyx_t_7, __pyx_n_s_dtype, __pyx_t_9) < 0) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0; + if (PyDict_SetItem(__pyx_t_7, __pyx_n_s_order, __pyx_n_s_F) < 0) __PYX_ERR(0, 257, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_5, __pyx_t_7); if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 257, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_9); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + if (!(likely(((__pyx_t_9) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_9, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 257, __pyx_L1_error) + __pyx_t_10 = ((PyArrayObject *)__pyx_t_9); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __pyx_t_11 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer, (PyObject*)__pyx_t_10, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); + if (unlikely(__pyx_t_11 < 0)) { + PyErr_Fetch(&__pyx_t_12, &__pyx_t_13, &__pyx_t_14); + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer, (PyObject*)__pyx_v_np_poly, &__Pyx_TypeInfo_nn___pyx_t_5numpy_double_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + Py_XDECREF(__pyx_t_12); Py_XDECREF(__pyx_t_13); Py_XDECREF(__pyx_t_14); + __Pyx_RaiseBufferFallbackError(); + } else { + PyErr_Restore(__pyx_t_12, __pyx_t_13, __pyx_t_14); + } + } + __pyx_pybuffernd_np_poly.diminfo[0].strides = __pyx_pybuffernd_np_poly.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_np_poly.diminfo[0].shape = __pyx_pybuffernd_np_poly.rcbuffer->pybuffer.shape[0]; + if (unlikely(__pyx_t_11 < 0)) __PYX_ERR(0, 257, __pyx_L1_error) + } + __pyx_t_10 = 0; + __Pyx_XDECREF_SET(__pyx_v_np_poly, ((PyArrayObject *)__pyx_t_9)); + __pyx_t_9 = 0; + + /* "pycocotools/_mask.pyx":258 + * for i, p in enumerate(poly): + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) # <<<<<<<<<<<<<< + * objs = _toString(Rs) + * return objs + */ + __pyx_t_15 = __Pyx_PyIndex_AsSsize_t(__pyx_v_i); if (unlikely((__pyx_t_15 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 258, __pyx_L1_error) + __pyx_t_16 = PyObject_Length(((PyObject *)__pyx_v_np_poly)); if (unlikely(__pyx_t_16 == -1)) __PYX_ERR(0, 258, __pyx_L1_error) + rleFrPoly(((RLE *)(&(__pyx_v_Rs->_R[__pyx_t_15]))), ((double const *)__pyx_v_np_poly->data), __Pyx_div_Py_ssize_t(__pyx_t_16, 2), __pyx_v_h, __pyx_v_w); + + /* "pycocotools/_mask.pyx":256 + * n = len(poly) + * Rs = RLEs(n) + * for i, p in enumerate(poly): # <<<<<<<<<<<<<< + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + */ + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":259 + * np_poly = np.array(p, dtype=np.double, order='F') + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + * objs = _toString(Rs) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); 
if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_9 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_9 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_9)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_9); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + } + } + if (!__pyx_t_9) { + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[2] = {__pyx_t_9, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_GOTREF(__pyx_t_2); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[2] = {__pyx_t_9, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_9); __pyx_t_9 = 0; + __Pyx_GOTREF(__pyx_t_2); + } else + #endif + { + __pyx_t_7 = PyTuple_New(1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_GIVEREF(__pyx_t_9); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_9); __pyx_t_9 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_7, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_7, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_2; + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":260 + * rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + * objs = _toString(Rs) + * return objs # <<<<<<<<<<<<<< + * + * def frUncompressedRLE(ucRles, siz h, siz w): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":252 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_XDECREF(__pyx_t_9); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.frPoly", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_np_poly.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_np_poly); + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XDECREF(__pyx_v_i); + __Pyx_XDECREF(__pyx_v_p); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":262 + * return objs + * + * def 
frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_21frUncompressedRLE(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_21frUncompressedRLE = {"frUncompressedRLE", (PyCFunction)__pyx_pw_11pycocotools_5_mask_21frUncompressedRLE, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_21frUncompressedRLE(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_ucRles = 0; + CYTHON_UNUSED siz __pyx_v_h; + CYTHON_UNUSED siz __pyx_v_w; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frUncompressedRLE (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_ucRles,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_ucRles)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, 1); __PYX_ERR(0, 262, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, 2); __PYX_ERR(0, 262, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frUncompressedRLE") < 0)) __PYX_ERR(0, 262, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_ucRles = values[0]; + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 262, __pyx_L3_error) + __pyx_v_w = __Pyx_PyInt_As_siz(values[2]); if (unlikely((__pyx_v_w == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 262, __pyx_L3_error) + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frUncompressedRLE", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 262, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.frUncompressedRLE", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_20frUncompressedRLE(__pyx_self, __pyx_v_ucRles, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_20frUncompressedRLE(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_ucRles, CYTHON_UNUSED siz __pyx_v_h, CYTHON_UNUSED siz __pyx_v_w) { + PyArrayObject *__pyx_v_cnts = 0; + RLE __pyx_v_R; + uint *__pyx_v_data; + Py_ssize_t 
__pyx_v_n; + PyObject *__pyx_v_objs = NULL; + Py_ssize_t __pyx_v_i; + struct __pyx_obj_11pycocotools_5_mask_RLEs *__pyx_v_Rs = NULL; + Py_ssize_t __pyx_v_j; + __Pyx_LocalBuf_ND __pyx_pybuffernd_cnts; + __Pyx_Buffer __pyx_pybuffer_cnts; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + Py_ssize_t __pyx_t_1; + PyObject *__pyx_t_2 = NULL; + Py_ssize_t __pyx_t_3; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyArrayObject *__pyx_t_8 = NULL; + int __pyx_t_9; + PyObject *__pyx_t_10 = NULL; + PyObject *__pyx_t_11 = NULL; + PyObject *__pyx_t_12 = NULL; + Py_ssize_t __pyx_t_13; + Py_ssize_t __pyx_t_14; + Py_ssize_t __pyx_t_15; + RLE __pyx_t_16; + siz __pyx_t_17; + int __pyx_t_18; + __Pyx_RefNannySetupContext("frUncompressedRLE", 0); + __pyx_pybuffer_cnts.pybuffer.buf = NULL; + __pyx_pybuffer_cnts.refcount = 0; + __pyx_pybuffernd_cnts.data = NULL; + __pyx_pybuffernd_cnts.rcbuffer = &__pyx_pybuffer_cnts; + + /* "pycocotools/_mask.pyx":266 + * cdef RLE R + * cdef uint *data + * n = len(ucRles) # <<<<<<<<<<<<<< + * objs = [] + * for i in range(n): + */ + __pyx_t_1 = PyObject_Length(__pyx_v_ucRles); if (unlikely(__pyx_t_1 == -1)) __PYX_ERR(0, 266, __pyx_L1_error) + __pyx_v_n = __pyx_t_1; + + /* "pycocotools/_mask.pyx":267 + * cdef uint *data + * n = len(ucRles) + * objs = [] # <<<<<<<<<<<<<< + * for i in range(n): + * Rs = RLEs(1) + */ + __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 267, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_v_objs = ((PyObject*)__pyx_t_2); + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":268 + * n = len(ucRles) + * objs = [] + * for i in range(n): # <<<<<<<<<<<<<< + * Rs = RLEs(1) + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + */ + __pyx_t_1 = __pyx_v_n; + for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_1; __pyx_t_3+=1) { + __pyx_v_i = __pyx_t_3; + + /* "pycocotools/_mask.pyx":269 + * objs = [] + * for i in range(n): + * Rs = RLEs(1) # <<<<<<<<<<<<<< + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + */ + __pyx_t_2 = __Pyx_PyObject_Call(((PyObject *)__pyx_ptype_11pycocotools_5_mask_RLEs), __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 269, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_XDECREF_SET(__pyx_v_Rs, ((struct __pyx_obj_11pycocotools_5_mask_RLEs *)__pyx_t_2)); + __pyx_t_2 = 0; + + /* "pycocotools/_mask.pyx":270 + * for i in range(n): + * Rs = RLEs(1) + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) # <<<<<<<<<<<<<< + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_array); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_5 = PyObject_GetItem(__pyx_t_2, __pyx_n_s_counts); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_GIVEREF(__pyx_t_5); + 
PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyDict_New(); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_uint32); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + if (PyDict_SetItem(__pyx_t_5, __pyx_n_s_dtype, __pyx_t_7) < 0) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_2, __pyx_t_5); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 270, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + if (!(likely(((__pyx_t_7) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_7, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 270, __pyx_L1_error) + __pyx_t_8 = ((PyArrayObject *)__pyx_t_7); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __pyx_t_9 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer, (PyObject*)__pyx_t_8, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack); + if (unlikely(__pyx_t_9 < 0)) { + PyErr_Fetch(&__pyx_t_10, &__pyx_t_11, &__pyx_t_12); + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer, (PyObject*)__pyx_v_cnts, &__Pyx_TypeInfo_nn___pyx_t_5numpy_uint32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) { + Py_XDECREF(__pyx_t_10); Py_XDECREF(__pyx_t_11); Py_XDECREF(__pyx_t_12); + __Pyx_RaiseBufferFallbackError(); + } else { + PyErr_Restore(__pyx_t_10, __pyx_t_11, __pyx_t_12); + } + } + __pyx_pybuffernd_cnts.diminfo[0].strides = __pyx_pybuffernd_cnts.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_cnts.diminfo[0].shape = __pyx_pybuffernd_cnts.rcbuffer->pybuffer.shape[0]; + if (unlikely(__pyx_t_9 < 0)) __PYX_ERR(0, 270, __pyx_L1_error) + } + __pyx_t_8 = 0; + __Pyx_XDECREF_SET(__pyx_v_cnts, ((PyArrayObject *)__pyx_t_7)); + __pyx_t_7 = 0; + + /* "pycocotools/_mask.pyx":272 + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) # <<<<<<<<<<<<<< + * for j in range(len(cnts)): + * data[j] = cnts[j] + */ + __pyx_t_13 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if (unlikely(__pyx_t_13 == -1)) __PYX_ERR(0, 272, __pyx_L1_error) + __pyx_v_data = ((uint *)malloc((__pyx_t_13 * (sizeof(unsigned int))))); + + /* "pycocotools/_mask.pyx":273 + * # time for malloc can be saved here but it's fine + * data = malloc(len(cnts)* sizeof(uint)) + * for j in range(len(cnts)): # <<<<<<<<<<<<<< + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + */ + __pyx_t_13 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if (unlikely(__pyx_t_13 == -1)) __PYX_ERR(0, 273, __pyx_L1_error) + for (__pyx_t_14 = 0; __pyx_t_14 < __pyx_t_13; __pyx_t_14+=1) { + __pyx_v_j = __pyx_t_14; + + /* "pycocotools/_mask.pyx":274 + * data = malloc(len(cnts)* sizeof(uint)) + * for j in range(len(cnts)): + * data[j] = cnts[j] # <<<<<<<<<<<<<< + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R + */ + __pyx_t_15 = __pyx_v_j; + __pyx_t_9 = -1; + if (__pyx_t_15 < 0) { + 
__pyx_t_15 += __pyx_pybuffernd_cnts.diminfo[0].shape; + if (unlikely(__pyx_t_15 < 0)) __pyx_t_9 = 0; + } else if (unlikely(__pyx_t_15 >= __pyx_pybuffernd_cnts.diminfo[0].shape)) __pyx_t_9 = 0; + if (unlikely(__pyx_t_9 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_9); + __PYX_ERR(0, 274, __pyx_L1_error) + } + (__pyx_v_data[__pyx_v_j]) = ((uint)(*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_uint32_t *, __pyx_pybuffernd_cnts.rcbuffer->pybuffer.buf, __pyx_t_15, __pyx_pybuffernd_cnts.diminfo[0].strides))); + } + + /* "pycocotools/_mask.pyx":275 + * for j in range(len(cnts)): + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) # <<<<<<<<<<<<<< + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) + */ + __pyx_t_7 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_5 = PyObject_GetItem(__pyx_t_7, __pyx_n_s_size); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_GetItemInt(__pyx_t_5, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_17 = __Pyx_PyInt_As_siz(__pyx_t_7); if (unlikely((__pyx_t_17 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_16.h = __pyx_t_17; + __pyx_t_7 = __Pyx_GetItemInt(__pyx_v_ucRles, __pyx_v_i, Py_ssize_t, 1, PyInt_FromSsize_t, 0, 1, 1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_5 = PyObject_GetItem(__pyx_t_7, __pyx_n_s_size); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_GetItemInt(__pyx_t_5, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_17 = __Pyx_PyInt_As_siz(__pyx_t_7); if (unlikely((__pyx_t_17 == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 275, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_16.w = __pyx_t_17; + __pyx_t_13 = PyObject_Length(((PyObject *)__pyx_v_cnts)); if (unlikely(__pyx_t_13 == -1)) __PYX_ERR(0, 275, __pyx_L1_error) + __pyx_t_16.m = __pyx_t_13; + __pyx_t_16.cnts = ((uint *)__pyx_v_data); + __pyx_v_R = __pyx_t_16; + + /* "pycocotools/_mask.pyx":276 + * data[j] = cnts[j] + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R # <<<<<<<<<<<<<< + * objs.append(_toString(Rs)[0]) + * return objs + */ + (__pyx_v_Rs->_R[0]) = __pyx_v_R; + + /* "pycocotools/_mask.pyx":277 + * R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) # <<<<<<<<<<<<<< + * return objs + * + */ + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_toString); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_2 = NULL; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_5))) { + __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_2)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); + __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_5, function); + } + } + if (!__pyx_t_2) { + __pyx_t_7 = 
__Pyx_PyObject_CallOneArg(__pyx_t_5, ((PyObject *)__pyx_v_Rs)); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + } else { + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_2, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_7 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[2] = {__pyx_t_2, ((PyObject *)__pyx_v_Rs)}; + __pyx_t_7 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_GOTREF(__pyx_t_7); + } else + #endif + { + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); __pyx_t_2 = NULL; + __Pyx_INCREF(((PyObject *)__pyx_v_Rs)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_Rs)); + PyTuple_SET_ITEM(__pyx_t_4, 0+1, ((PyObject *)__pyx_v_Rs)); + __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_4, NULL); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + } + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + __pyx_t_5 = __Pyx_GetItemInt(__pyx_t_7, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_18 = __Pyx_PyList_Append(__pyx_v_objs, __pyx_t_5); if (unlikely(__pyx_t_18 == -1)) __PYX_ERR(0, 277, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + + /* "pycocotools/_mask.pyx":278 + * Rs._R[0] = R + * objs.append(_toString(Rs)[0]) + * return objs # <<<<<<<<<<<<<< + * + * def frPyObjects(pyobj, siz h, w): + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":262 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("pycocotools._mask.frUncompressedRLE", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_cnts.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_cnts); + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XDECREF((PyObject *)__pyx_v_Rs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "pycocotools/_mask.pyx":280 + * return objs + * + * def frPyObjects(pyobj, siz h, w): # <<<<<<<<<<<<<< + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_11pycocotools_5_mask_23frPyObjects(PyObject *__pyx_self, PyObject *__pyx_args, PyObject 
*__pyx_kwds); /*proto*/ +static PyMethodDef __pyx_mdef_11pycocotools_5_mask_23frPyObjects = {"frPyObjects", (PyCFunction)__pyx_pw_11pycocotools_5_mask_23frPyObjects, METH_VARARGS|METH_KEYWORDS, 0}; +static PyObject *__pyx_pw_11pycocotools_5_mask_23frPyObjects(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyObject *__pyx_v_pyobj = 0; + siz __pyx_v_h; + PyObject *__pyx_v_w = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("frPyObjects (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_pyobj,&__pyx_n_s_h,&__pyx_n_s_w,0}; + PyObject* values[3] = {0,0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_pyobj)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + case 1: + if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_h)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, 1); __PYX_ERR(0, 280, __pyx_L3_error) + } + case 2: + if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_w)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, 2); __PYX_ERR(0, 280, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "frPyObjects") < 0)) __PYX_ERR(0, 280, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + values[2] = PyTuple_GET_ITEM(__pyx_args, 2); + } + __pyx_v_pyobj = values[0]; + __pyx_v_h = __Pyx_PyInt_As_siz(values[1]); if (unlikely((__pyx_v_h == ((siz)-1)) && PyErr_Occurred())) __PYX_ERR(0, 280, __pyx_L3_error) + __pyx_v_w = values[2]; + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("frPyObjects", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 280, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("pycocotools._mask.frPyObjects", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + __pyx_r = __pyx_pf_11pycocotools_5_mask_22frPyObjects(__pyx_self, __pyx_v_pyobj, __pyx_v_h, __pyx_v_w); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_11pycocotools_5_mask_22frPyObjects(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_pyobj, siz __pyx_v_h, PyObject *__pyx_v_w) { + PyObject *__pyx_v_objs = NULL; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; + int __pyx_t_8; + Py_ssize_t __pyx_t_9; + __Pyx_RefNannySetupContext("frPyObjects", 0); + + /* "pycocotools/_mask.pyx":281 + * + * def frPyObjects(pyobj, siz h, w): + * if type(pyobj) == np.ndarray: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + */ + __pyx_t_1 
= PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)__pyx_ptype_5numpy_ndarray), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 281, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 281, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":282 + * def frPyObjects(pyobj, siz h, w): + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w ) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frBbox); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = NULL; + __pyx_t_6 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_6 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_5, __pyx_v_pyobj, __pyx_t_4, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_5, __pyx_v_pyobj, __pyx_t_4, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_7 = PyTuple_New(3+__pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (__pyx_t_5) { + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __pyx_t_5 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_7, 0+__pyx_t_6, __pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_7, 1+__pyx_t_6, __pyx_t_4); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_7, 2+__pyx_t_6, __pyx_v_w); + __pyx_t_4 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_7, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 282, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":281 + * + * def frPyObjects(pyobj, siz h, w): + * if type(pyobj) == np.ndarray: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + */ + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":283 + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) == 4: # <<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject 
*)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 283, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_8) { + } else { + __pyx_t_2 = __pyx_t_8; + goto __pyx_L4_bool_binop_done; + } + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_9 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_9 == -1)) __PYX_ERR(0, 283, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_8 = ((__pyx_t_9 == 4) != 0); + __pyx_t_2 = __pyx_t_8; + __pyx_L4_bool_binop_done:; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":284 + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w ) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w ) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frBbox); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_7 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_4 = NULL; + __pyx_t_6 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_4)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_6 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_t_7, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_4, __pyx_v_pyobj, __pyx_t_7, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } else + #endif + { + __pyx_t_5 = PyTuple_New(3+__pyx_t_6); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + if (__pyx_t_4) { + __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_5, 0+__pyx_t_6, __pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_5, 1+__pyx_t_6, __pyx_t_7); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_5, 2+__pyx_t_6, __pyx_v_w); + __pyx_t_7 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 284, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":283 + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) == 4: # 
<<<<<<<<<<<<<< + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + */ + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":285 + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) > 4: # <<<<<<<<<<<<<< + * objs = frPoly(pyobj, h, w ) + * elif type(pyobj) == list and type(pyobj[0]) == dict: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 285, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_8) { + } else { + __pyx_t_2 = __pyx_t_8; + goto __pyx_L6_bool_binop_done; + } + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_9 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_9 == -1)) __PYX_ERR(0, 285, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_8 = ((__pyx_t_9 > 4) != 0); + __pyx_t_2 = __pyx_t_8; + __pyx_L6_bool_binop_done:; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":286 + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w ) # <<<<<<<<<<<<<< + * elif type(pyobj) == list and type(pyobj[0]) == dict: + * objs = frUncompressedRLE(pyobj, h, w) + */ + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_frPoly); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_5 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_7 = NULL; + __pyx_t_6 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_7)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + __Pyx_INCREF(__pyx_t_7); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_3, function); + __pyx_t_6 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_7, __pyx_v_pyobj, __pyx_t_5, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { + PyObject *__pyx_temp[4] = {__pyx_t_7, __pyx_v_pyobj, __pyx_t_5, __pyx_v_w}; + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; + } else + #endif + { + __pyx_t_4 = PyTuple_New(3+__pyx_t_6); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + if (__pyx_t_7) { + __Pyx_GIVEREF(__pyx_t_7); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_7); __pyx_t_7 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_4, 0+__pyx_t_6, __pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_4, 1+__pyx_t_6, __pyx_t_5); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + 
PyTuple_SET_ITEM(__pyx_t_4, 2+__pyx_t_6, __pyx_v_w); + __pyx_t_5 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 286, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_v_objs = __pyx_t_1; + __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":285 + * elif type(pyobj) == list and len(pyobj[0]) == 4: + * objs = frBbox(pyobj, h, w ) + * elif type(pyobj) == list and len(pyobj[0]) > 4: # <<<<<<<<<<<<<< + * objs = frPoly(pyobj, h, w ) + * elif type(pyobj) == list and type(pyobj[0]) == dict: + */ + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":287 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w ) + * elif type(pyobj) == list and type(pyobj[0]) == dict: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE(pyobj, h, w) + * else: + */ + __pyx_t_1 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_v_pyobj)), ((PyObject *)(&PyList_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_8) { + } else { + __pyx_t_2 = __pyx_t_8; + goto __pyx_L8_bool_binop_done; + } + __pyx_t_1 = __Pyx_GetItemInt(__pyx_v_pyobj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = PyObject_RichCompare(((PyObject *)Py_TYPE(__pyx_t_1)), ((PyObject *)(&PyDict_Type)), Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_8 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_8 < 0)) __PYX_ERR(0, 287, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_2 = __pyx_t_8; + __pyx_L8_bool_binop_done:; + if (__pyx_t_2) { + + /* "pycocotools/_mask.pyx":288 + * objs = frPoly(pyobj, h, w ) + * elif type(pyobj) == list and type(pyobj[0]) == dict: + * objs = frUncompressedRLE(pyobj, h, w) # <<<<<<<<<<<<<< + * else: + * raise Exception('input type is not supported.') + */ + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_frUncompressedRLE); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = __Pyx_PyInt_From_siz(__pyx_v_h); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = NULL; + __pyx_t_6 = 0; + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_1); + if (likely(__pyx_t_5)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(function); + __Pyx_DECREF_SET(__pyx_t_1, function); + __pyx_t_6 = 1; + } + } + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_5, __pyx_v_pyobj, __pyx_t_4, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + PyObject *__pyx_temp[4] = {__pyx_t_5, __pyx_v_pyobj, __pyx_t_4, __pyx_v_w}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_6, 3+__pyx_t_6); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else + #endif + { + __pyx_t_7 = PyTuple_New(3+__pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (__pyx_t_5) { + __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_5); __pyx_t_5 = NULL; + } + __Pyx_INCREF(__pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_v_pyobj); + PyTuple_SET_ITEM(__pyx_t_7, 0+__pyx_t_6, __pyx_v_pyobj); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_7, 1+__pyx_t_6, __pyx_t_4); + __Pyx_INCREF(__pyx_v_w); + __Pyx_GIVEREF(__pyx_v_w); + PyTuple_SET_ITEM(__pyx_t_7, 2+__pyx_t_6, __pyx_v_w); + __pyx_t_4 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 288, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_v_objs = __pyx_t_3; + __pyx_t_3 = 0; + + /* "pycocotools/_mask.pyx":287 + * elif type(pyobj) == list and len(pyobj[0]) > 4: + * objs = frPoly(pyobj, h, w ) + * elif type(pyobj) == list and type(pyobj[0]) == dict: # <<<<<<<<<<<<<< + * objs = frUncompressedRLE(pyobj, h, w) + * else: + */ + goto __pyx_L3; + } + + /* "pycocotools/_mask.pyx":290 + * objs = frUncompressedRLE(pyobj, h, w) + * else: + * raise Exception('input type is not supported.') # <<<<<<<<<<<<<< + * return objs + */ + /*else*/ { + __pyx_t_3 = __Pyx_PyObject_Call(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0])), __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 290, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(0, 290, __pyx_L1_error) + } + __pyx_L3:; + + /* "pycocotools/_mask.pyx":291 + * else: + * raise Exception('input type is not supported.') + * return objs # <<<<<<<<<<<<<< + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_v_objs); + __pyx_r = __pyx_v_objs; + goto __pyx_L0; + + /* "pycocotools/_mask.pyx":280 + * return objs + * + * def frPyObjects(pyobj, siz h, w): # <<<<<<<<<<<<<< + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_AddTraceback("pycocotools._mask.frPyObjects", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_objs); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":197 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. + * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fullfill the PEP. 
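
The generated `frPyObjects` body above follows the dispatch spelled out in the quoted `pycocotools/_mask.pyx` source (lines 280-291): an `np.ndarray` or a list of 4-element boxes is routed to `frBbox`, a list of longer flat coordinate lists to `frPoly`, a list of dicts to `frUncompressedRLE`, and anything else raises `Exception('input type is not supported.')`. A minimal usage sketch of that dispatch, assuming a stock `pycocotools` install where `_mask` is re-exported as `pycocotools.mask`; the input values are illustrative, not taken from this patch:

```python
# Sketch of the frPyObjects dispatch quoted in the .pyx comments above.
# Assumes a standard pycocotools install; pycocotools.mask wraps _mask.
import numpy as np
from pycocotools import mask as maskUtils

h, w = 480, 640

# np.ndarray of [x, y, width, height] boxes -> frBbox path.
boxes = np.array([[10.0, 10.0, 90.0, 70.0]], dtype=np.float64)
rle_from_box = maskUtils.frPyObjects(boxes, h, w)

# List of flat polygons (more than 4 coordinates each) -> frPoly path.
polys = [[10.0, 10.0, 100.0, 10.0, 100.0, 80.0]]
rle_from_poly = maskUtils.frPyObjects(polys, h, w)

# List of uncompressed RLE dicts -> frUncompressedRLE path.
# counts alternates runs of 0s and 1s and should sum to h * w.
uncompressed = [{'size': [h, w], 'counts': [h * w // 2, h * w - h * w // 2]}]
rle_from_uc = maskUtils.frPyObjects(uncompressed, h, w)
```
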
+ */ + +/* Python wrapper */ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/ +static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_r; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0); + __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { + int __pyx_v_copy_shape; + int __pyx_v_i; + int __pyx_v_ndim; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + int __pyx_v_t; + char *__pyx_v_f; + PyArray_Descr *__pyx_v_descr = 0; + int __pyx_v_offset; + int __pyx_v_hasfields; + int __pyx_r; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + int __pyx_t_5; + PyObject *__pyx_t_6 = NULL; + char *__pyx_t_7; + __Pyx_RefNannySetupContext("__getbuffer__", 0); + if (__pyx_v_info != NULL) { + __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(__pyx_v_info->obj); + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":203 + * # of flags + * + * if info == NULL: return # <<<<<<<<<<<<<< + * + * cdef int copy_shape, i, ndim + */ + __pyx_t_1 = ((__pyx_v_info == NULL) != 0); + if (__pyx_t_1) { + __pyx_r = 0; + goto __pyx_L0; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":206 + * + * cdef int copy_shape, i, ndim + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + */ + __pyx_v_endian_detector = 1; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":207 + * cdef int copy_shape, i, ndim + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * + * ndim = PyArray_NDIM(self) + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":209 + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * + * ndim = PyArray_NDIM(self) # <<<<<<<<<<<<<< + * + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_v_ndim = PyArray_NDIM(__pyx_v_self); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":211 + * ndim = PyArray_NDIM(self) + * + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * copy_shape = 1 + * else: + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":212 + * + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * copy_shape = 1 # <<<<<<<<<<<<<< + * else: + * copy_shape = 0 + */ + __pyx_v_copy_shape = 1; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":211 + * ndim = PyArray_NDIM(self) + * + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * copy_shape = 1 + * else: + */ + goto __pyx_L4; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214 + * copy_shape = 1 + * else: + * copy_shape = 0 # <<<<<<<<<<<<<< + * + * 
if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + */ + /*else*/ { + __pyx_v_copy_shape = 0; + } + __pyx_L4:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":216 + * copy_shape = 0 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L6_bool_binop_done; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":217 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not C contiguous") + * + */ + __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L6_bool_binop_done:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":216 + * copy_shape = 0 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":218 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 218, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 218, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":216 + * copy_shape = 0 + * + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":220 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + __pyx_t_2 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L9_bool_binop_done; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":221 + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): # <<<<<<<<<<<<<< + * raise ValueError(u"ndarray is not Fortran contiguous") + * + */ + __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L9_bool_binop_done:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":220 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) 
# <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__18, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 222, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 222, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":220 + * raise ValueError(u"ndarray is not C contiguous") + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) # <<<<<<<<<<<<<< + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":224 + * raise ValueError(u"ndarray is not Fortran contiguous") + * + * info.buf = PyArray_DATA(self) # <<<<<<<<<<<<<< + * info.ndim = ndim + * if copy_shape: + */ + __pyx_v_info->buf = PyArray_DATA(__pyx_v_self); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":225 + * + * info.buf = PyArray_DATA(self) + * info.ndim = ndim # <<<<<<<<<<<<<< + * if copy_shape: + * # Allocate new buffer for strides and shape info. + */ + __pyx_v_info->ndim = __pyx_v_ndim; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if copy_shape: # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + */ + __pyx_t_1 = (__pyx_v_copy_shape != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229 + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + * info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) # <<<<<<<<<<<<<< + * info.shape = info.strides + ndim + * for i in range(ndim): + */ + __pyx_v_info->strides = ((Py_ssize_t *)malloc((((sizeof(Py_ssize_t)) * ((size_t)__pyx_v_ndim)) * 2))); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":230 + * # This is allocated as one block, strides first. 
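
The `__getbuffer__` shim being generated here comes from Cython's bundled `numpy/__init__.pxd`: it fills a `Py_buffer` with `buf`, `ndim`, `shape`, `strides`, `itemsize`, `format`, and `readonly`, copying shape and strides into one malloc'd block (strides first, then shape) when `sizeof(npy_intp) != sizeof(Py_ssize_t)`. The fields it populates are the same ones plain Python sees through `memoryview`; a small illustrative check, assuming a C-contiguous float64 array:

```python
# Illustrative only: the fields filled in by the __getbuffer__ shim above
# are the same ones the buffer protocol exposes via memoryview.
import numpy as np

a = np.arange(12, dtype=np.double).reshape(3, 4)  # C-contiguous, itemsize 8
m = memoryview(a)
print(m.ndim)     # 2
print(m.shape)    # (3, 4)
print(m.strides)  # (32, 8): row stride = 4 items * 8 bytes
print(m.format)   # 'd', matching the NPY_DOUBLE -> "d" case further down
```
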
+ * info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) + * info.shape = info.strides + ndim # <<<<<<<<<<<<<< + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + */ + __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":231 + * info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) + * info.shape = info.strides + ndim + * for i in range(ndim): # <<<<<<<<<<<<<< + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] + */ + __pyx_t_4 = __pyx_v_ndim; + for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { + __pyx_v_i = __pyx_t_5; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":232 + * info.shape = info.strides + ndim + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] # <<<<<<<<<<<<<< + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + */ + (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233 + * for i in range(ndim): + * info.strides[i] = PyArray_STRIDES(self)[i] + * info.shape[i] = PyArray_DIMS(self)[i] # <<<<<<<<<<<<<< + * else: + * info.strides = PyArray_STRIDES(self) + */ + (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]); + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226 + * info.buf = PyArray_DATA(self) + * info.ndim = ndim + * if copy_shape: # <<<<<<<<<<<<<< + * # Allocate new buffer for strides and shape info. + * # This is allocated as one block, strides first. + */ + goto __pyx_L11; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235 + * info.shape[i] = PyArray_DIMS(self)[i] + * else: + * info.strides = PyArray_STRIDES(self) # <<<<<<<<<<<<<< + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + */ + /*else*/ { + __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self)); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":236 + * else: + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) # <<<<<<<<<<<<<< + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + */ + __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self)); + } + __pyx_L11:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":237 + * info.strides = PyArray_STRIDES(self) + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL # <<<<<<<<<<<<<< + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) + */ + __pyx_v_info->suboffsets = NULL; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":238 + * info.shape = PyArray_DIMS(self) + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) # <<<<<<<<<<<<<< + * info.readonly = not PyArray_ISWRITEABLE(self) + * + */ + __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239 + * info.suboffsets = NULL + * info.itemsize = PyArray_ITEMSIZE(self) + * info.readonly = not PyArray_ISWRITEABLE(self) # <<<<<<<<<<<<<< + * + * cdef int t + */ + __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0)); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":242 + * 
+ * cdef int t + * cdef char* f = NULL # <<<<<<<<<<<<<< + * cdef dtype descr = self.descr + * cdef int offset + */ + __pyx_v_f = NULL; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":243 + * cdef int t + * cdef char* f = NULL + * cdef dtype descr = self.descr # <<<<<<<<<<<<<< + * cdef int offset + * + */ + __pyx_t_3 = ((PyObject *)__pyx_v_self->descr); + __Pyx_INCREF(__pyx_t_3); + __pyx_v_descr = ((PyArray_Descr *)__pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246 + * cdef int offset + * + * cdef bint hasfields = PyDataType_HASFIELDS(descr) # <<<<<<<<<<<<<< + * + * if not hasfields and not copy_shape: + */ + __pyx_v_hasfields = PyDataType_HASFIELDS(__pyx_v_descr); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248 + * cdef bint hasfields = PyDataType_HASFIELDS(descr) + * + * if not hasfields and not copy_shape: # <<<<<<<<<<<<<< + * # do not call releasebuffer + * info.obj = None + */ + __pyx_t_2 = ((!(__pyx_v_hasfields != 0)) != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L15_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_copy_shape != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L15_bool_binop_done:; + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":250 + * if not hasfields and not copy_shape: + * # do not call releasebuffer + * info.obj = None # <<<<<<<<<<<<<< + * else: + * # need to call releasebuffer + */ + __Pyx_INCREF(Py_None); + __Pyx_GIVEREF(Py_None); + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); + __pyx_v_info->obj = Py_None; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248 + * cdef bint hasfields = PyDataType_HASFIELDS(descr) + * + * if not hasfields and not copy_shape: # <<<<<<<<<<<<<< + * # do not call releasebuffer + * info.obj = None + */ + goto __pyx_L14; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253 + * else: + * # need to call releasebuffer + * info.obj = self # <<<<<<<<<<<<<< + * + * if not hasfields: + */ + /*else*/ { + __Pyx_INCREF(((PyObject *)__pyx_v_self)); + __Pyx_GIVEREF(((PyObject *)__pyx_v_self)); + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); + __pyx_v_info->obj = ((PyObject *)__pyx_v_self); + } + __pyx_L14:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255 + * info.obj = self + * + * if not hasfields: # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + __pyx_t_1 = ((!(__pyx_v_hasfields != 0)) != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":256 + * + * if not hasfields: + * t = descr.type_num # <<<<<<<<<<<<<< + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + */ + __pyx_t_4 = __pyx_v_descr->type_num; + __pyx_v_t = __pyx_t_4; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257 + * if not hasfields: + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '>') != 0); + if (!__pyx_t_2) { + goto __pyx_L20_next_or; + } else { + } + __pyx_t_2 = 
(__pyx_v_little_endian != 0); + if (!__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L19_bool_binop_done; + } + __pyx_L20_next_or:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":258 + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + */ + __pyx_t_2 = ((__pyx_v_descr->byteorder == '<') != 0); + if (__pyx_t_2) { + } else { + __pyx_t_1 = __pyx_t_2; + goto __pyx_L19_bool_binop_done; + } + __pyx_t_2 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_1 = __pyx_t_2; + __pyx_L19_bool_binop_done:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257 + * if not hasfields: + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__19, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 259, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257 + * if not hasfields: + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260 + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" # <<<<<<<<<<<<<< + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + */ + switch (__pyx_v_t) { + case NPY_BYTE: + __pyx_v_f = ((char *)"b"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261 + * raise ValueError(u"Non-native byte order not supported") + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + */ + case NPY_UBYTE: + __pyx_v_f = ((char *)"B"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":262 + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + */ + case NPY_SHORT: + __pyx_v_f = ((char *)"h"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263 + * elif t == NPY_UBYTE: f = "B" + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + */ + case NPY_USHORT: + __pyx_v_f = ((char *)"H"); + break; + + /* 
"../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":264 + * elif t == NPY_SHORT: f = "h" + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + */ + case NPY_INT: + __pyx_v_f = ((char *)"i"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265 + * elif t == NPY_USHORT: f = "H" + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + */ + case NPY_UINT: + __pyx_v_f = ((char *)"I"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":266 + * elif t == NPY_INT: f = "i" + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + */ + case NPY_LONG: + __pyx_v_f = ((char *)"l"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267 + * elif t == NPY_UINT: f = "I" + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + */ + case NPY_ULONG: + __pyx_v_f = ((char *)"L"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":268 + * elif t == NPY_LONG: f = "l" + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + */ + case NPY_LONGLONG: + __pyx_v_f = ((char *)"q"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":269 + * elif t == NPY_ULONG: f = "L" + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + */ + case NPY_ULONGLONG: + __pyx_v_f = ((char *)"Q"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270 + * elif t == NPY_LONGLONG: f = "q" + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + */ + case NPY_FLOAT: + __pyx_v_f = ((char *)"f"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":271 + * elif t == NPY_ULONGLONG: f = "Q" + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" + */ + case NPY_DOUBLE: + __pyx_v_f = ((char *)"d"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272 + * elif t == NPY_FLOAT: f = "f" + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + */ + case NPY_LONGDOUBLE: + __pyx_v_f = ((char *)"g"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273 + * elif t == NPY_DOUBLE: f = "d" + * elif t == NPY_LONGDOUBLE: f = "g" + * elif t == NPY_CFLOAT: f = "Zf" # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + */ + case NPY_CFLOAT: + __pyx_v_f = ((char *)"Zf"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274 + * elif t == NPY_LONGDOUBLE: f = "g" + * 
elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" + */ + case NPY_CDOUBLE: + __pyx_v_f = ((char *)"Zd"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":275 + * elif t == NPY_CFLOAT: f = "Zf" + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f = "O" + * else: + */ + case NPY_CLONGDOUBLE: + __pyx_v_f = ((char *)"Zg"); + break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276 + * elif t == NPY_CDOUBLE: f = "Zd" + * elif t == NPY_CLONGDOUBLE: f = "Zg" + * elif t == NPY_OBJECT: f = "O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + case NPY_OBJECT: + __pyx_v_f = ((char *)"O"); + break; + default: + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278 + * elif t == NPY_OBJECT: f = "O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * info.format = f + * return + */ + __pyx_t_3 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_6 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_3); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_6); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_6); + __pyx_t_6 = 0; + __pyx_t_6 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_3, NULL); if (unlikely(!__pyx_t_6)) __PYX_ERR(1, 278, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_Raise(__pyx_t_6, 0, 0, 0); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __PYX_ERR(1, 278, __pyx_L1_error) + break; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":279 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f # <<<<<<<<<<<<<< + * return + * else: + */ + __pyx_v_info->format = __pyx_v_f; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280 + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * info.format = f + * return # <<<<<<<<<<<<<< + * else: + * info.format = stdlib.malloc(_buffer_format_string_len) + */ + __pyx_r = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255 + * info.obj = self + * + * if not hasfields: # <<<<<<<<<<<<<< + * t = descr.type_num + * if ((descr.byteorder == c'>' and little_endian) or + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282 + * return + * else: + * info.format = stdlib.malloc(_buffer_format_string_len) # <<<<<<<<<<<<<< + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 + */ + /*else*/ { + __pyx_v_info->format = ((char *)malloc(0xFF)); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283 + * else: + * info.format = stdlib.malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment # <<<<<<<<<<<<<< + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, + */ + 
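+      /* '^' marks native byte order with no alignment padding; _util_dtypestring()
+         then appends one struct-format code per field after this prefix. */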
(__pyx_v_info->format[0]) = '^'; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":284 + * info.format = stdlib.malloc(_buffer_format_string_len) + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 # <<<<<<<<<<<<<< + * f = _util_dtypestring(descr, info.format + 1, + * info.format + _buffer_format_string_len, + */ + __pyx_v_offset = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":285 + * info.format[0] = c'^' # Native data types, manual alignment + * offset = 0 + * f = _util_dtypestring(descr, info.format + 1, # <<<<<<<<<<<<<< + * info.format + _buffer_format_string_len, + * &offset) + */ + __pyx_t_7 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 0xFF), (&__pyx_v_offset)); if (unlikely(__pyx_t_7 == NULL)) __PYX_ERR(1, 285, __pyx_L1_error) + __pyx_v_f = __pyx_t_7; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288 + * info.format + _buffer_format_string_len, + * &offset) + * f[0] = c'\0' # Terminate format string # <<<<<<<<<<<<<< + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + */ + (__pyx_v_f[0]) = '\x00'; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":197 + * # experimental exception made for __getbuffer__ and __releasebuffer__ + * # -- the details of this may change. + * def __getbuffer__(ndarray self, Py_buffer* info, int flags): # <<<<<<<<<<<<<< + * # This implementation of getbuffer is geared towards Cython + * # requirements, and does not yet fullfill the PEP. + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + if (__pyx_v_info != NULL && __pyx_v_info->obj != NULL) { + __Pyx_GOTREF(__pyx_v_info->obj); + __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = NULL; + } + goto __pyx_L2; + __pyx_L0:; + if (__pyx_v_info != NULL && __pyx_v_info->obj == Py_None) { + __Pyx_GOTREF(Py_None); + __Pyx_DECREF(Py_None); __pyx_v_info->obj = NULL; + } + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_descr); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":290 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * stdlib.free(info.format) + */ + +/* Python wrapper */ +static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/ +static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0); + __pyx_pf_5numpy_7ndarray_2__releasebuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) { + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("__releasebuffer__", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":291 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if 
PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * stdlib.free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292 + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): + * stdlib.free(info.format) # <<<<<<<<<<<<<< + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * stdlib.free(info.strides) + */ + free(__pyx_v_info->format); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":291 + * + * def __releasebuffer__(ndarray self, Py_buffer* info): + * if PyArray_HASFIELDS(self): # <<<<<<<<<<<<<< + * stdlib.free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":293 + * if PyArray_HASFIELDS(self): + * stdlib.free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * stdlib.free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":294 + * stdlib.free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): + * stdlib.free(info.strides) # <<<<<<<<<<<<<< + * # info.shape was stored after info.strides in the same block + * + */ + free(__pyx_v_info->strides); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":293 + * if PyArray_HASFIELDS(self): + * stdlib.free(info.format) + * if sizeof(npy_intp) != sizeof(Py_ssize_t): # <<<<<<<<<<<<<< + * stdlib.free(info.strides) + * # info.shape was stored after info.strides in the same block + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":290 + * f[0] = c'\0' # Terminate format string + * + * def __releasebuffer__(ndarray self, Py_buffer* info): # <<<<<<<<<<<<<< + * if PyArray_HASFIELDS(self): + * stdlib.free(info.format) + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":770 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771 + * + * cdef inline object PyArray_MultiIterNew1(a): + * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew2(a, b): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 771, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":770 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + 
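+  /* error path of PyArray_MultiIterNew1: drop the temporary reference and
+     record a traceback frame before returning 0 (a NULL object) to the caller */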
__Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":774 + * + * cdef inline object PyArray_MultiIterNew2(a, b): + * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 774, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":773 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":777 + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 777, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":776 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779 + * return PyArray_MultiIterNew(3, a, b, c) + * + * 
cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780 + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 780, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":779 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783 + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 783, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":782 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":785 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline char* 
_util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. + */ + +static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) { + PyArray_Descr *__pyx_v_child = 0; + int __pyx_v_endian_detector; + int __pyx_v_little_endian; + PyObject *__pyx_v_fields = 0; + PyObject *__pyx_v_childname = NULL; + PyObject *__pyx_v_new_offset = NULL; + PyObject *__pyx_v_t = NULL; + char *__pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + Py_ssize_t __pyx_t_2; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + int __pyx_t_5; + int __pyx_t_6; + int __pyx_t_7; + long __pyx_t_8; + char *__pyx_t_9; + __Pyx_RefNannySetupContext("_util_dtypestring", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790 + * + * cdef dtype child + * cdef int endian_detector = 1 # <<<<<<<<<<<<<< + * cdef bint little_endian = ((&endian_detector)[0] != 0) + * cdef tuple fields + */ + __pyx_v_endian_detector = 1; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791 + * cdef dtype child + * cdef int endian_detector = 1 + * cdef bint little_endian = ((&endian_detector)[0] != 0) # <<<<<<<<<<<<<< + * cdef tuple fields + * + */ + __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + if (unlikely(__pyx_v_descr->names == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable"); + __PYX_ERR(1, 794, __pyx_L1_error) + } + __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; + for (;;) { + if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(1, 794, __pyx_L1_error) + #else + __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 794, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + #endif + __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3); + __pyx_t_3 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":795 + * + * for childname in descr.names: + * fields = descr.fields[childname] # <<<<<<<<<<<<<< + * child, new_offset = fields + * + */ + if (unlikely(__pyx_v_descr->fields == Py_None)) { + PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); + __PYX_ERR(1, 795, __pyx_L1_error) + } + __pyx_t_3 = __Pyx_PyDict_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 795, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) __PYX_ERR(1, 795, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3)); + __pyx_t_3 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796 + * for childname in descr.names: + * fields = descr.fields[childname] + * 
child, new_offset = fields # <<<<<<<<<<<<<< + * + * if (end - f) - (new_offset - offset[0]) < 15: + */ + if (likely(__pyx_v_fields != Py_None)) { + PyObject* sequence = __pyx_v_fields; + #if !CYTHON_COMPILING_IN_PYPY + Py_ssize_t size = Py_SIZE(sequence); + #else + Py_ssize_t size = PySequence_Size(sequence); + #endif + if (unlikely(size != 2)) { + if (size > 2) __Pyx_RaiseTooManyValuesError(2); + else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); + __PYX_ERR(1, 796, __pyx_L1_error) + } + #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); + __Pyx_INCREF(__pyx_t_3); + __Pyx_INCREF(__pyx_t_4); + #else + __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 796, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 796, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + #endif + } else { + __Pyx_RaiseNoneNotIterableError(); __PYX_ERR(1, 796, __pyx_L1_error) + } + if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) __PYX_ERR(1, 796, __pyx_L1_error) + __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3)); + __pyx_t_3 = 0; + __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 798, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 798, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) __PYX_ERR(1, 798, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0); + if (__pyx_t_6) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and little_endian) or + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__20, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 799, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 799, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798 + * child, new_offset = fields + * + * if (end - f) - (new_offset - offset[0]) < 15: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + __pyx_t_7 = 
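+      /* the "(a and b) or (c and d)" byte-order check from the .pxd source is
+         lowered into explicit gotos to preserve short-circuit evaluation */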
((__pyx_v_child->byteorder == '>') != 0); + if (!__pyx_t_7) { + goto __pyx_L8_next_or; + } else { + } + __pyx_t_7 = (__pyx_v_little_endian != 0); + if (!__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_L8_next_or:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":802 + * + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): # <<<<<<<<<<<<<< + * raise ValueError(u"Non-native byte order not supported") + * # One could encode it in the format string and have Cython + */ + __pyx_t_7 = ((__pyx_v_child->byteorder == '<') != 0); + if (__pyx_t_7) { + } else { + __pyx_t_6 = __pyx_t_7; + goto __pyx_L7_bool_binop_done; + } + __pyx_t_7 = ((!(__pyx_v_little_endian != 0)) != 0); + __pyx_t_6 = __pyx_t_7; + __pyx_L7_bool_binop_done:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + if (__pyx_t_6) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 + * if ((child.byteorder == c'>' and little_endian) or + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__21, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 803, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 803, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801 + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + * + * if ((child.byteorder == c'>' and little_endian) or # <<<<<<<<<<<<<< + * (child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":813 + * + * # Output padding bytes + * while offset[0] < new_offset: # <<<<<<<<<<<<<< + * f[0] = 120 # "x"; pad byte + * f += 1 + */ + while (1) { + __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 813, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, __pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 813, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 813, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (!__pyx_t_6) break; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814 + * # Output padding bytes + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte # <<<<<<<<<<<<<< + * f += 1 + * offset[0] += 1 + */ + (__pyx_v_f[0]) = 0x78; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":815 + * while offset[0] < new_offset: + * f[0] = 120 # "x"; pad byte + * f += 1 # <<<<<<<<<<<<<< + * 
offset[0] += 1 + * + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":816 + * f[0] = 120 # "x"; pad byte + * f += 1 + * offset[0] += 1 # <<<<<<<<<<<<<< + * + * offset[0] += child.itemsize + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + 1); + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818 + * offset[0] += 1 + * + * offset[0] += child.itemsize # <<<<<<<<<<<<<< + * + * if not PyDataType_HASFIELDS(child): + */ + __pyx_t_8 = 0; + (__pyx_v_offset[__pyx_t_8]) = ((__pyx_v_offset[__pyx_t_8]) + __pyx_v_child->elsize); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0); + if (__pyx_t_6) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":821 + * + * if not PyDataType_HASFIELDS(child): + * t = child.type_num # <<<<<<<<<<<<<< + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") + */ + __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 821, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4); + __pyx_t_4 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 + * if not PyDataType_HASFIELDS(child): + * t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0); + if (__pyx_t_6) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__22, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 823, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_Raise(__pyx_t_4, 0, 0, 0); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __PYX_ERR(1, 823, __pyx_L1_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822 + * if not PyDataType_HASFIELDS(child): + * t = child.type_num + * if end - f < 5: # <<<<<<<<<<<<<< + * raise RuntimeError(u"Format string allocated too short.") + * + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826 + * + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" # <<<<<<<<<<<<<< + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_BYTE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 826, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 826, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 826, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 98; + goto __pyx_L15; + } + + /* 
"../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827 + * # Until ticket #99 is fixed, use integers to avoid warnings + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" # <<<<<<<<<<<<<< + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UBYTE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 827, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 827, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 827, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 66; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":828 + * if t == NPY_BYTE: f[0] = 98 #"b" + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" # <<<<<<<<<<<<<< + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_SHORT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 828, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 828, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 828, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x68; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":829 + * elif t == NPY_UBYTE: f[0] = 66 #"B" + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" # <<<<<<<<<<<<<< + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_USHORT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 829, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 829, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 829, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 72; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":830 + * elif t == NPY_SHORT: f[0] = 104 #"h" + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" # <<<<<<<<<<<<<< + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_INT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 830, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 830, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 830, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x69; + goto __pyx_L15; + } + + /* 
"../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831 + * elif t == NPY_USHORT: f[0] = 72 #"H" + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" # <<<<<<<<<<<<<< + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_UINT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 831, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 831, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 831, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 73; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":832 + * elif t == NPY_INT: f[0] = 105 #"i" + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" # <<<<<<<<<<<<<< + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 832, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 832, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 832, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x6C; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833 + * elif t == NPY_UINT: f[0] = 73 #"I" + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" # <<<<<<<<<<<<<< + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 833, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 833, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 833, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 76; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834 + * elif t == NPY_LONG: f[0] = 108 #"l" + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" # <<<<<<<<<<<<<< + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 834, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x71; + goto __pyx_L15; + } + + /* 
"../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":835 + * elif t == NPY_ULONG: f[0] = 76 #"L" + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" # <<<<<<<<<<<<<< + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 835, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 835, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 835, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 81; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":836 + * elif t == NPY_LONGLONG: f[0] = 113 #"q" + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" # <<<<<<<<<<<<<< + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_FLOAT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 836, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 836, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 836, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x66; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837 + * elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" # <<<<<<<<<<<<<< + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 837, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 0x64; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838 + * elif t == NPY_FLOAT: f[0] = 102 #"f" + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" # <<<<<<<<<<<<<< + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 838, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) 
{ + (__pyx_v_f[0]) = 0x67; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839 + * elif t == NPY_DOUBLE: f[0] = 100 #"d" + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf # <<<<<<<<<<<<<< + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 839, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 839, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x66; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840 + * elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd # <<<<<<<<<<<<<< + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 840, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x64; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":841 + * elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg # <<<<<<<<<<<<<< + * elif t == NPY_OBJECT: f[0] = 79 #"O" + * else: + */ + __pyx_t_3 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 841, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 841, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 841, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 90; + (__pyx_v_f[1]) = 0x67; + __pyx_v_f = (__pyx_v_f + 1); + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842 + * elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + * elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + * elif t == NPY_OBJECT: f[0] = 79 #"O" # <<<<<<<<<<<<<< + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + */ + __pyx_t_4 = __Pyx_PyInt_From_enum__NPY_TYPES(NPY_OBJECT); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_3 = 
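+    /* last arm of the chain: t == NPY_OBJECT writes 'O' (79); any other
+       type_num falls through to the ValueError below */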
PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(1, 842, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + if (__pyx_t_6) { + (__pyx_v_f[0]) = 79; + goto __pyx_L15; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844 + * elif t == NPY_OBJECT: f[0] = 79 #"O" + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) # <<<<<<<<<<<<<< + * f += 1 + * else: + */ + /*else*/ { + __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); + __pyx_t_3 = 0; + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 844, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_Raise(__pyx_t_3, 0, 0, 0); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __PYX_ERR(1, 844, __pyx_L1_error) + } + __pyx_L15:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845 + * else: + * raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + * f += 1 # <<<<<<<<<<<<<< + * else: + * # Cython ignores struct boundary information ("T{...}"), + */ + __pyx_v_f = (__pyx_v_f + 1); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820 + * offset[0] += child.itemsize + * + * if not PyDataType_HASFIELDS(child): # <<<<<<<<<<<<<< + * t = child.type_num + * if end - f < 5: + */ + goto __pyx_L13; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":849 + * # Cython ignores struct boundary information ("T{...}"), + * # so don't output it + * f = _util_dtypestring(child, f, end, offset) # <<<<<<<<<<<<<< + * return f + * + */ + /*else*/ { + __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_9 == NULL)) __PYX_ERR(1, 849, __pyx_L1_error) + __pyx_v_f = __pyx_t_9; + } + __pyx_L13:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794 + * cdef tuple fields + * + * for childname in descr.names: # <<<<<<<<<<<<<< + * fields = descr.fields[childname] + * child, new_offset = fields + */ + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850 + * # so don't output it + * f = _util_dtypestring(child, f, end, offset) + * return f # <<<<<<<<<<<<<< + * + * + */ + __pyx_r = __pyx_v_f; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":785 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: # <<<<<<<<<<<<<< + * # Recursive utility function used in __getbuffer__ to get format + * # string. The new location in the format string is returned. 
+ */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF((PyObject *)__pyx_v_child); + __Pyx_XDECREF(__pyx_v_fields); + __Pyx_XDECREF(__pyx_v_childname); + __Pyx_XDECREF(__pyx_v_new_offset); + __Pyx_XDECREF(__pyx_v_t); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + +static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) { + PyObject *__pyx_v_baseptr; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + int __pyx_t_2; + __Pyx_RefNannySetupContext("set_array_base", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":968 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + __pyx_t_1 = (__pyx_v_base == Py_None); + __pyx_t_2 = (__pyx_t_1 != 0); + if (__pyx_t_2) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":969 + * cdef PyObject* baseptr + * if base is None: + * baseptr = NULL # <<<<<<<<<<<<<< + * else: + * Py_INCREF(base) # important to do this before decref below! + */ + __pyx_v_baseptr = NULL; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":968 + * cdef inline void set_array_base(ndarray arr, object base): + * cdef PyObject* baseptr + * if base is None: # <<<<<<<<<<<<<< + * baseptr = NULL + * else: + */ + goto __pyx_L3; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":971 + * baseptr = NULL + * else: + * Py_INCREF(base) # important to do this before decref below! # <<<<<<<<<<<<<< + * baseptr = base + * Py_XDECREF(arr.base) + */ + /*else*/ { + Py_INCREF(__pyx_v_base); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":972 + * else: + * Py_INCREF(base) # important to do this before decref below! + * baseptr = base # <<<<<<<<<<<<<< + * Py_XDECREF(arr.base) + * arr.base = baseptr + */ + __pyx_v_baseptr = ((PyObject *)__pyx_v_base); + } + __pyx_L3:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":973 + * Py_INCREF(base) # important to do this before decref below! 
+ * baseptr = base + * Py_XDECREF(arr.base) # <<<<<<<<<<<<<< + * arr.base = baseptr + * + */ + Py_XDECREF(__pyx_v_arr->base); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":974 + * baseptr = base + * Py_XDECREF(arr.base) + * arr.base = baseptr # <<<<<<<<<<<<<< + * + * cdef inline object get_array_base(ndarray arr): + */ + __pyx_v_arr->base = __pyx_v_baseptr; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966 + * + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * cdef PyObject* baseptr + * if base is None: + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("get_array_base", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0); + if (__pyx_t_1) { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":978 + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: + * return None # <<<<<<<<<<<<<< + * else: + * return arr.base + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(Py_None); + __pyx_r = Py_None; + goto __pyx_L0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977 + * + * cdef inline object get_array_base(ndarray arr): + * if arr.base is NULL: # <<<<<<<<<<<<<< + * return None + * else: + */ + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":980 + * return None + * else: + * return arr.base # <<<<<<<<<<<<<< + * + * + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_arr->base)); + __pyx_r = ((PyObject *)__pyx_v_arr->base); + goto __pyx_L0; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976 + * arr.base = baseptr + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * if arr.base is NULL: + * return None + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":985 + * # Versions of the import_* functions which are more suitable for + * # Cython code. + * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_array", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":986 + * # Cython code. 
+ * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":987 + * cdef inline int import_array() except -1: + * try: + * _import_array() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") + */ + __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == -1)) __PYX_ERR(1, 987, __pyx_L3_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":986 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L10_try_end; + __pyx_L3_error:; + __Pyx_PyThreadState_assign + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":988 + * try: + * _import_array() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.multiarray failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 988, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":989 + * _import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__23, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 989, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 989, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":986 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * _import_array() + * except Exception: + */ + __Pyx_PyThreadState_assign + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L10_try_end:; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":985 + * # Versions of the import_* functions which are more suitable for + * # Cython code. 
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * _import_array() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":991 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_umath", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":993 + * cdef inline int import_umath() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == -1)) __PYX_ERR(1, 993, __pyx_L3_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L10_try_end; + __pyx_L3_error:; + __Pyx_PyThreadState_assign + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":994 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 994, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":995 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__24, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 995, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 995, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + 
__pyx_L5_except_error:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":992 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_PyThreadState_assign + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L10_try_end:; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":991 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + __Pyx_RefNannySetupContext("import_ufunc", 0); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":999 + * cdef inline int import_ufunc() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == -1)) __PYX_ERR(1, 999, __pyx_L3_error) + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L10_try_end; + __pyx_L3_error:; + __Pyx_PyThreadState_assign + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1000 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 1000, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + 
__Pyx_GOTREF(__pyx_t_7); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__25, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 1001, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 1001, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":998 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_PyThreadState_assign + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L10_try_end:; + } + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_tp_new_11pycocotools_5_mask_RLEs(PyTypeObject *t, PyObject *a, PyObject *k) { + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + if (unlikely(__pyx_pw_11pycocotools_5_mask_4RLEs_1__cinit__(o, a, k) < 0)) goto bad; + return o; + bad: + Py_DECREF(o); o = 0; + return NULL; +} + +static void __pyx_tp_dealloc_11pycocotools_5_mask_RLEs(PyObject *o) { + #if PY_VERSION_HEX >= 0x030400a1 + if (unlikely(Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + { + PyObject *etype, *eval, *etb; + PyErr_Fetch(&etype, &eval, &etb); + ++Py_REFCNT(o); + __pyx_pw_11pycocotools_5_mask_4RLEs_3__dealloc__(o); + --Py_REFCNT(o); + PyErr_Restore(etype, eval, etb); + } + (*Py_TYPE(o)->tp_free)(o); +} + +static PyObject *__pyx_tp_getattro_11pycocotools_5_mask_RLEs(PyObject *o, PyObject *n) { + PyObject *v = PyObject_GenericGetAttr(o, n); + if (!v && PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + v = __pyx_pw_11pycocotools_5_mask_4RLEs_5__getattr__(o, n); + } + return v; +} + +static PyMethodDef __pyx_methods_11pycocotools_5_mask_RLEs[] = { + {"__getattr__", (PyCFunction)__pyx_pw_11pycocotools_5_mask_4RLEs_5__getattr__, METH_O|METH_COEXIST, 0}, + {0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_11pycocotools_5_mask_RLEs = { + PyVarObject_HEAD_INIT(0, 0) + "pycocotools._mask.RLEs", /*tp_name*/ + sizeof(struct __pyx_obj_11pycocotools_5_mask_RLEs), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_11pycocotools_5_mask_RLEs, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, 
/*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + __pyx_tp_getattro_11pycocotools_5_mask_RLEs, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_11pycocotools_5_mask_RLEs, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_11pycocotools_5_mask_RLEs, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif +}; + +static PyObject *__pyx_tp_new_11pycocotools_5_mask_Masks(PyTypeObject *t, PyObject *a, PyObject *k) { + PyObject *o; + if (likely((t->tp_flags & Py_TPFLAGS_IS_ABSTRACT) == 0)) { + o = (*t->tp_alloc)(t, 0); + } else { + o = (PyObject *) PyBaseObject_Type.tp_new(t, __pyx_empty_tuple, 0); + } + if (unlikely(!o)) return 0; + if (unlikely(__pyx_pw_11pycocotools_5_mask_5Masks_1__cinit__(o, a, k) < 0)) goto bad; + return o; + bad: + Py_DECREF(o); o = 0; + return NULL; +} + +static void __pyx_tp_dealloc_11pycocotools_5_mask_Masks(PyObject *o) { + #if PY_VERSION_HEX >= 0x030400a1 + if (unlikely(Py_TYPE(o)->tp_finalize) && (!PyType_IS_GC(Py_TYPE(o)) || !_PyGC_FINALIZED(o))) { + if (PyObject_CallFinalizerFromDealloc(o)) return; + } + #endif + (*Py_TYPE(o)->tp_free)(o); +} + +static PyMethodDef __pyx_methods_11pycocotools_5_mask_Masks[] = { + {"__array__", (PyCFunction)__pyx_pw_11pycocotools_5_mask_5Masks_3__array__, METH_NOARGS, 0}, + {0, 0, 0, 0} +}; + +static PyTypeObject __pyx_type_11pycocotools_5_mask_Masks = { + PyVarObject_HEAD_INIT(0, 0) + "pycocotools._mask.Masks", /*tp_name*/ + sizeof(struct __pyx_obj_11pycocotools_5_mask_Masks), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + __pyx_tp_dealloc_11pycocotools_5_mask_Masks, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + #if PY_MAJOR_VERSION < 3 + 0, /*tp_compare*/ + #endif + #if PY_MAJOR_VERSION >= 3 + 0, /*tp_as_async*/ + #endif + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash*/ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE, /*tp_flags*/ + 0, /*tp_doc*/ + 0, /*tp_traverse*/ + 0, /*tp_clear*/ + 0, /*tp_richcompare*/ + 0, /*tp_weaklistoffset*/ + 0, /*tp_iter*/ + 0, /*tp_iternext*/ + __pyx_methods_11pycocotools_5_mask_Masks, /*tp_methods*/ + 0, /*tp_members*/ + 0, /*tp_getset*/ + 0, /*tp_base*/ + 0, /*tp_dict*/ + 0, /*tp_descr_get*/ + 0, /*tp_descr_set*/ + 0, /*tp_dictoffset*/ + 0, /*tp_init*/ + 0, /*tp_alloc*/ + __pyx_tp_new_11pycocotools_5_mask_Masks, /*tp_new*/ + 0, /*tp_free*/ + 0, /*tp_is_gc*/ + 0, /*tp_bases*/ + 0, /*tp_mro*/ + 0, /*tp_cache*/ + 0, /*tp_subclasses*/ + 0, /*tp_weaklist*/ + 0, /*tp_del*/ + 0, /*tp_version_tag*/ + #if PY_VERSION_HEX >= 0x030400a1 + 0, /*tp_finalize*/ + #endif +}; + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + 
+#if PY_MAJOR_VERSION >= 3 +static struct PyModuleDef __pyx_moduledef = { + #if PY_VERSION_HEX < 0x03020000 + { PyObject_HEAD_INIT(NULL) NULL, 0, NULL }, + #else + PyModuleDef_HEAD_INIT, + #endif + "_mask", + 0, /* m_doc */ + -1, /* m_size */ + __pyx_methods /* m_methods */, + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_AttributeError, __pyx_k_AttributeError, sizeof(__pyx_k_AttributeError), 0, 0, 1, 1}, + {&__pyx_n_s_F, __pyx_k_F, sizeof(__pyx_k_F), 0, 0, 1, 1}, + {&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0}, + {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0}, + {&__pyx_n_s_ImportError, __pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, + {&__pyx_n_s_N, __pyx_k_N, sizeof(__pyx_k_N), 0, 0, 1, 1}, + {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0}, + {&__pyx_n_s_R, __pyx_k_R, sizeof(__pyx_k_R), 0, 0, 1, 1}, + {&__pyx_n_s_Rs, __pyx_k_Rs, sizeof(__pyx_k_Rs), 0, 0, 1, 1}, + {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1}, + {&__pyx_kp_s_The_dt_and_gt_should_have_the_sa, __pyx_k_The_dt_and_gt_should_have_the_sa, sizeof(__pyx_k_The_dt_and_gt_should_have_the_sa), 0, 0, 1, 0}, + {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, + {&__pyx_n_s_a, __pyx_k_a, sizeof(__pyx_k_a), 0, 0, 1, 1}, + {&__pyx_n_s_a_2, __pyx_k_a_2, sizeof(__pyx_k_a_2), 0, 0, 1, 1}, + {&__pyx_n_s_all, __pyx_k_all, sizeof(__pyx_k_all), 0, 0, 1, 1}, + {&__pyx_n_s_area, __pyx_k_area, sizeof(__pyx_k_area), 0, 0, 1, 1}, + {&__pyx_n_s_array, __pyx_k_array, sizeof(__pyx_k_array), 0, 0, 1, 1}, + {&__pyx_n_s_astype, __pyx_k_astype, sizeof(__pyx_k_astype), 0, 0, 1, 1}, + {&__pyx_n_s_author, __pyx_k_author, sizeof(__pyx_k_author), 0, 0, 1, 1}, + {&__pyx_n_s_bb, __pyx_k_bb, sizeof(__pyx_k_bb), 0, 0, 1, 1}, + {&__pyx_n_s_bbIou, __pyx_k_bbIou, sizeof(__pyx_k_bbIou), 0, 0, 1, 1}, + {&__pyx_n_s_bb_2, __pyx_k_bb_2, sizeof(__pyx_k_bb_2), 0, 0, 1, 1}, + {&__pyx_n_s_c_string, __pyx_k_c_string, sizeof(__pyx_k_c_string), 0, 0, 1, 1}, + {&__pyx_n_s_cnts, __pyx_k_cnts, sizeof(__pyx_k_cnts), 0, 0, 1, 1}, + {&__pyx_n_s_counts, __pyx_k_counts, sizeof(__pyx_k_counts), 0, 0, 1, 1}, + {&__pyx_n_s_data, __pyx_k_data, sizeof(__pyx_k_data), 0, 0, 1, 1}, + {&__pyx_n_s_decode, __pyx_k_decode, sizeof(__pyx_k_decode), 0, 0, 1, 1}, + {&__pyx_n_s_double, __pyx_k_double, sizeof(__pyx_k_double), 0, 0, 1, 1}, + {&__pyx_n_s_dt, __pyx_k_dt, sizeof(__pyx_k_dt), 0, 0, 1, 1}, + {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1}, + {&__pyx_n_s_encode, __pyx_k_encode, sizeof(__pyx_k_encode), 0, 0, 1, 1}, + {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1}, + {&__pyx_n_s_frBbox, __pyx_k_frBbox, sizeof(__pyx_k_frBbox), 0, 0, 1, 1}, + {&__pyx_n_s_frPoly, __pyx_k_frPoly, sizeof(__pyx_k_frPoly), 0, 0, 1, 1}, + {&__pyx_n_s_frPyObjects, __pyx_k_frPyObjects, sizeof(__pyx_k_frPyObjects), 0, 0, 1, 1}, + {&__pyx_n_s_frString, __pyx_k_frString, sizeof(__pyx_k_frString), 0, 0, 1, 1}, + {&__pyx_n_s_frUncompressedRLE, __pyx_k_frUncompressedRLE, sizeof(__pyx_k_frUncompressedRLE), 0, 0, 1, 1}, + {&__pyx_n_s_gt, __pyx_k_gt, 
sizeof(__pyx_k_gt), 0, 0, 1, 1}, + {&__pyx_n_s_h, __pyx_k_h, sizeof(__pyx_k_h), 0, 0, 1, 1}, + {&__pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_k_home_ftian_code_py_faster_rcnn, sizeof(__pyx_k_home_ftian_code_py_faster_rcnn), 0, 0, 1, 0}, + {&__pyx_n_s_i, __pyx_k_i, sizeof(__pyx_k_i), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_kp_s_input_data_type_not_allowed, __pyx_k_input_data_type_not_allowed, sizeof(__pyx_k_input_data_type_not_allowed), 0, 0, 1, 0}, + {&__pyx_kp_s_input_type_is_not_supported, __pyx_k_input_type_is_not_supported, sizeof(__pyx_k_input_type_is_not_supported), 0, 0, 1, 0}, + {&__pyx_n_s_intersect, __pyx_k_intersect, sizeof(__pyx_k_intersect), 0, 0, 1, 1}, + {&__pyx_n_s_iou, __pyx_k_iou, sizeof(__pyx_k_iou), 0, 0, 1, 1}, + {&__pyx_n_s_iouFun, __pyx_k_iouFun, sizeof(__pyx_k_iouFun), 0, 0, 1, 1}, + {&__pyx_n_s_iou_2, __pyx_k_iou_2, sizeof(__pyx_k_iou_2), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__bbIou, __pyx_k_iou_locals__bbIou, sizeof(__pyx_k_iou_locals__bbIou), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__len, __pyx_k_iou_locals__len, sizeof(__pyx_k_iou_locals__len), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__preproc, __pyx_k_iou_locals__preproc, sizeof(__pyx_k_iou_locals__preproc), 0, 0, 1, 1}, + {&__pyx_n_s_iou_locals__rleIou, __pyx_k_iou_locals__rleIou, sizeof(__pyx_k_iou_locals__rleIou), 0, 0, 1, 1}, + {&__pyx_n_s_isbox, __pyx_k_isbox, sizeof(__pyx_k_isbox), 0, 0, 1, 1}, + {&__pyx_n_s_iscrowd, __pyx_k_iscrowd, sizeof(__pyx_k_iscrowd), 0, 0, 1, 1}, + {&__pyx_n_s_isrle, __pyx_k_isrle, sizeof(__pyx_k_isrle), 0, 0, 1, 1}, + {&__pyx_n_s_j, __pyx_k_j, sizeof(__pyx_k_j), 0, 0, 1, 1}, + {&__pyx_n_s_len, __pyx_k_len, sizeof(__pyx_k_len), 0, 0, 1, 1}, + {&__pyx_kp_s_list_input_can_be_bounding_box_N, __pyx_k_list_input_can_be_bounding_box_N, sizeof(__pyx_k_list_input_can_be_bounding_box_N), 0, 0, 1, 0}, + {&__pyx_n_s_m, __pyx_k_m, sizeof(__pyx_k_m), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_mask, __pyx_k_mask, sizeof(__pyx_k_mask), 0, 0, 1, 1}, + {&__pyx_n_s_masks, __pyx_k_masks, sizeof(__pyx_k_masks), 0, 0, 1, 1}, + {&__pyx_n_s_merge, __pyx_k_merge, sizeof(__pyx_k_merge), 0, 0, 1, 1}, + {&__pyx_n_s_n, __pyx_k_n, sizeof(__pyx_k_n), 0, 0, 1, 1}, + {&__pyx_kp_u_ndarray_is_not_C_contiguous, __pyx_k_ndarray_is_not_C_contiguous, sizeof(__pyx_k_ndarray_is_not_C_contiguous), 0, 1, 0, 0}, + {&__pyx_kp_u_ndarray_is_not_Fortran_contiguou, __pyx_k_ndarray_is_not_Fortran_contiguou, sizeof(__pyx_k_ndarray_is_not_Fortran_contiguou), 0, 1, 0, 0}, + {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1}, + {&__pyx_n_s_np_poly, __pyx_k_np_poly, sizeof(__pyx_k_np_poly), 0, 0, 1, 1}, + {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1}, + {&__pyx_kp_s_numpy_core_multiarray_failed_to, __pyx_k_numpy_core_multiarray_failed_to, sizeof(__pyx_k_numpy_core_multiarray_failed_to), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_core_umath_failed_to_impor, __pyx_k_numpy_core_umath_failed_to_impor, sizeof(__pyx_k_numpy_core_umath_failed_to_impor), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_ndarray_input_is_only_for, __pyx_k_numpy_ndarray_input_is_only_for, sizeof(__pyx_k_numpy_ndarray_input_is_only_for), 0, 0, 1, 0}, + {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1}, + {&__pyx_n_s_objs, __pyx_k_objs, sizeof(__pyx_k_objs), 0, 0, 1, 1}, + {&__pyx_n_s_order, __pyx_k_order, sizeof(__pyx_k_order), 0, 0, 1, 1}, + {&__pyx_n_s_p, __pyx_k_p, sizeof(__pyx_k_p), 0, 0, 1, 1}, + {&__pyx_n_s_poly, __pyx_k_poly, 
sizeof(__pyx_k_poly), 0, 0, 1, 1}, + {&__pyx_n_s_preproc, __pyx_k_preproc, sizeof(__pyx_k_preproc), 0, 0, 1, 1}, + {&__pyx_n_s_py_string, __pyx_k_py_string, sizeof(__pyx_k_py_string), 0, 0, 1, 1}, + {&__pyx_n_s_pycocotools__mask, __pyx_k_pycocotools__mask, sizeof(__pyx_k_pycocotools__mask), 0, 0, 1, 1}, + {&__pyx_n_s_pyiscrowd, __pyx_k_pyiscrowd, sizeof(__pyx_k_pyiscrowd), 0, 0, 1, 1}, + {&__pyx_n_s_pyobj, __pyx_k_pyobj, sizeof(__pyx_k_pyobj), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_reshape, __pyx_k_reshape, sizeof(__pyx_k_reshape), 0, 0, 1, 1}, + {&__pyx_n_s_rleIou, __pyx_k_rleIou, sizeof(__pyx_k_rleIou), 0, 0, 1, 1}, + {&__pyx_n_s_rleObjs, __pyx_k_rleObjs, sizeof(__pyx_k_rleObjs), 0, 0, 1, 1}, + {&__pyx_n_s_shape, __pyx_k_shape, sizeof(__pyx_k_shape), 0, 0, 1, 1}, + {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_toBbox, __pyx_k_toBbox, sizeof(__pyx_k_toBbox), 0, 0, 1, 1}, + {&__pyx_n_s_toString, __pyx_k_toString, sizeof(__pyx_k_toString), 0, 0, 1, 1}, + {&__pyx_n_s_tsungyi, __pyx_k_tsungyi, sizeof(__pyx_k_tsungyi), 0, 0, 1, 1}, + {&__pyx_n_s_ucRles, __pyx_k_ucRles, sizeof(__pyx_k_ucRles), 0, 0, 1, 1}, + {&__pyx_n_s_uint32, __pyx_k_uint32, sizeof(__pyx_k_uint32), 0, 0, 1, 1}, + {&__pyx_n_s_uint8, __pyx_k_uint8, sizeof(__pyx_k_uint8), 0, 0, 1, 1}, + {&__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_k_unknown_dtype_code_in_numpy_pxd, sizeof(__pyx_k_unknown_dtype_code_in_numpy_pxd), 0, 1, 0, 0}, + {&__pyx_kp_s_unrecognized_type_The_following, __pyx_k_unrecognized_type_The_following, sizeof(__pyx_k_unrecognized_type_The_following), 0, 0, 1, 0}, + {&__pyx_n_s_w, __pyx_k_w, sizeof(__pyx_k_w), 0, 0, 1, 1}, + {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 64, __pyx_L1_error) + __pyx_builtin_AttributeError = __Pyx_GetBuiltinName(__pyx_n_s_AttributeError); if (!__pyx_builtin_AttributeError) __PYX_ERR(0, 70, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 121, __pyx_L1_error) + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(1, 218, __pyx_L1_error) + __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) __PYX_ERR(1, 799, __pyx_L1_error) + __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(1, 989, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "pycocotools/_mask.pyx":146 + * def merge(rleObjs, bint intersect=0): + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) # <<<<<<<<<<<<<< + * rleMerge(Rs._R, R._R, Rs._n, intersect) + * obj = _toString(R)[0] + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_int_1); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 146, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "pycocotools/_mask.pyx":172 + * # check if it's Nx4 bbox + * if not len(objs.shape) == 2 or not objs.shape[1] == 4: + * raise Exception('numpy ndarray input is only for *bounding boxes* and 
should have Nx4 dimension') # <<<<<<<<<<<<<< + * objs = objs.astype(np.double) + * elif type(objs) == list: + */ + __pyx_tuple__2 = PyTuple_Pack(1, __pyx_kp_s_numpy_ndarray_input_is_only_for); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(0, 172, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "pycocotools/_mask.pyx":185 + * objs = _frString(objs) + * else: + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') # <<<<<<<<<<<<<< + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + */ + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s_list_input_can_be_bounding_box_N); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + __Pyx_GIVEREF(__pyx_tuple__3); + + /* "pycocotools/_mask.pyx":187 + * raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + * else: + * raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') # <<<<<<<<<<<<<< + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_kp_s_unrecognized_type_The_following); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 187, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__4); + __Pyx_GIVEREF(__pyx_tuple__4); + + /* "pycocotools/_mask.pyx":164 + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): + * def _preproc(objs): # <<<<<<<<<<<<<< + * if len(objs) == 0: + * return objs + */ + __pyx_tuple__5 = PyTuple_Pack(4, __pyx_n_s_objs, __pyx_n_s_isbox, __pyx_n_s_isrle, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 164, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__5); + __Pyx_GIVEREF(__pyx_tuple__5); + __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(1, 0, 4, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__5, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_preproc, 164, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 164, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":189 + * raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + * return objs + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + */ + __pyx_tuple__7 = PyTuple_Pack(6, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 189, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__7); + __Pyx_GIVEREF(__pyx_tuple__7); + __pyx_codeobj__8 = (PyObject*)__Pyx_PyCode_New(6, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__7, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_rleIou, 189, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__8)) __PYX_ERR(0, 189, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":191 + * def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): # <<<<<<<<<<<<<< + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): + */ + __pyx_tuple__9 = PyTuple_Pack(6, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__9); + __Pyx_GIVEREF(__pyx_tuple__9); + __pyx_codeobj__10 = (PyObject*)__Pyx_PyCode_New(6, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__9, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_bbIou, 191, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__10)) __PYX_ERR(0, 191, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":193 + * def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + * bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + * def _len(obj): # <<<<<<<<<<<<<< + * cdef siz N = 0 + * if type(obj) == RLEs: + */ + __pyx_tuple__11 = PyTuple_Pack(2, __pyx_n_s_obj, __pyx_n_s_N); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__11); + __Pyx_GIVEREF(__pyx_tuple__11); + __pyx_codeobj__12 = (PyObject*)__Pyx_PyCode_New(1, 0, 2, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__11, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_len, 193, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__12)) __PYX_ERR(0, 193, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":213 + * return [] + * if not type(dt) == type(gt): + * raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') # <<<<<<<<<<<<<< + * + * # define local variables + */ + __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_s_The_dt_and_gt_should_have_the_sa); if (unlikely(!__pyx_tuple__13)) __PYX_ERR(0, 213, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__13); + __Pyx_GIVEREF(__pyx_tuple__13); + + /* "pycocotools/_mask.pyx":224 + * _iouFun = _bbIou + * 
else: + * raise Exception('input data type not allowed.') # <<<<<<<<<<<<<< + * _iou = malloc(m*n* sizeof(double)) + * iou = np.zeros((m*n, ), dtype=np.double) + */ + __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_s_input_data_type_not_allowed); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 224, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__14); + __Pyx_GIVEREF(__pyx_tuple__14); + + /* "pycocotools/_mask.pyx":269 + * objs = [] + * for i in range(n): + * Rs = RLEs(1) # <<<<<<<<<<<<<< + * cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + * # time for malloc can be saved here but it's fine + */ + __pyx_tuple__15 = PyTuple_Pack(1, __pyx_int_1); if (unlikely(!__pyx_tuple__15)) __PYX_ERR(0, 269, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__15); + __Pyx_GIVEREF(__pyx_tuple__15); + + /* "pycocotools/_mask.pyx":290 + * objs = frUncompressedRLE(pyobj, h, w) + * else: + * raise Exception('input type is not supported.') # <<<<<<<<<<<<<< + * return objs + */ + __pyx_tuple__16 = PyTuple_Pack(1, __pyx_kp_s_input_type_is_not_supported); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 290, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__16); + __Pyx_GIVEREF(__pyx_tuple__16); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":218 + * if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + * raise ValueError(u"ndarray is not C contiguous") # <<<<<<<<<<<<<< + * + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + */ + __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__17)) __PYX_ERR(1, 218, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__17); + __Pyx_GIVEREF(__pyx_tuple__17); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222 + * if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + * and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + * raise ValueError(u"ndarray is not Fortran contiguous") # <<<<<<<<<<<<<< + * + * info.buf = PyArray_DATA(self) + */ + __pyx_tuple__18 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__18)) __PYX_ERR(1, 222, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__18); + __Pyx_GIVEREF(__pyx_tuple__18); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259 + * if ((descr.byteorder == c'>' and little_endian) or + * (descr.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * if t == NPY_BYTE: f = "b" + * elif t == NPY_UBYTE: f = "B" + */ + __pyx_tuple__19 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__19)) __PYX_ERR(1, 259, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__19); + __Pyx_GIVEREF(__pyx_tuple__19); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799 + * + * if (end - f) - (new_offset - offset[0]) < 15: + * raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") # <<<<<<<<<<<<<< + * + * if ((child.byteorder == c'>' and little_endian) or + */ + __pyx_tuple__20 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__20)) __PYX_ERR(1, 799, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__20); + __Pyx_GIVEREF(__pyx_tuple__20); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803 + * if ((child.byteorder == c'>' and little_endian) or + * 
(child.byteorder == c'<' and not little_endian)): + * raise ValueError(u"Non-native byte order not supported") # <<<<<<<<<<<<<< + * # One could encode it in the format string and have Cython + * # complain instead, BUT: < and > in format strings also imply + */ + __pyx_tuple__21 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__21)) __PYX_ERR(1, 803, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__21); + __Pyx_GIVEREF(__pyx_tuple__21); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823 + * t = child.type_num + * if end - f < 5: + * raise RuntimeError(u"Format string allocated too short.") # <<<<<<<<<<<<<< + * + * # Until ticket #99 is fixed, use integers to avoid warnings + */ + __pyx_tuple__22 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__22)) __PYX_ERR(1, 823, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__22); + __Pyx_GIVEREF(__pyx_tuple__22); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":989 + * _import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_tuple__23 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple__23)) __PYX_ERR(1, 989, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__23); + __Pyx_GIVEREF(__pyx_tuple__23); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":995 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_tuple__24 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__24)) __PYX_ERR(1, 995, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__24); + __Pyx_GIVEREF(__pyx_tuple__24); + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":1001 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + */ + __pyx_tuple__25 = PyTuple_Pack(1, __pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__25)) __PYX_ERR(1, 1001, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__25); + __Pyx_GIVEREF(__pyx_tuple__25); + + /* "pycocotools/_mask.pyx":100 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + __pyx_tuple__26 = PyTuple_Pack(6, __pyx_n_s_Rs, __pyx_n_s_n, __pyx_n_s_py_string, __pyx_n_s_c_string, __pyx_n_s_objs, __pyx_n_s_i); if (unlikely(!__pyx_tuple__26)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__26); + __Pyx_GIVEREF(__pyx_tuple__26); + __pyx_codeobj__27 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__26, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_toString, 100, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__27)) __PYX_ERR(0, 100, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":116 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + __pyx_tuple__28 = PyTuple_Pack(7, __pyx_n_s_rleObjs, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_py_string, __pyx_n_s_c_string, __pyx_n_s_i, __pyx_n_s_obj); if 
(unlikely(!__pyx_tuple__28)) __PYX_ERR(0, 116, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__28); + __Pyx_GIVEREF(__pyx_tuple__28); + __pyx_codeobj__29 = (PyObject*)__Pyx_PyCode_New(1, 0, 7, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__28, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_frString, 116, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__29)) __PYX_ERR(0, 116, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":129 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + __pyx_tuple__30 = PyTuple_Pack(6, __pyx_n_s_mask, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__30)) __PYX_ERR(0, 129, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__30); + __Pyx_GIVEREF(__pyx_tuple__30); + __pyx_codeobj__31 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__30, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_encode, 129, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__31)) __PYX_ERR(0, 129, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":137 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + __pyx_tuple__32 = PyTuple_Pack(6, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_masks); if (unlikely(!__pyx_tuple__32)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__32); + __Pyx_GIVEREF(__pyx_tuple__32); + __pyx_codeobj__33 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__32, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_decode, 137, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__33)) __PYX_ERR(0, 137, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":144 + * return np.array(masks) + * + * def merge(rleObjs, bint intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + __pyx_tuple__34 = PyTuple_Pack(5, __pyx_n_s_rleObjs, __pyx_n_s_intersect, __pyx_n_s_Rs, __pyx_n_s_R, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__34)) __PYX_ERR(0, 144, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__34); + __Pyx_GIVEREF(__pyx_tuple__34); + __pyx_codeobj__35 = (PyObject*)__Pyx_PyCode_New(2, 0, 5, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__34, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_merge, 144, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__35)) __PYX_ERR(0, 144, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":151 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + __pyx_tuple__36 = PyTuple_Pack(5, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_a, __pyx_n_s_shape, __pyx_n_s_a_2); if (unlikely(!__pyx_tuple__36)) __PYX_ERR(0, 151, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__36); + __Pyx_GIVEREF(__pyx_tuple__36); + __pyx_codeobj__37 = (PyObject*)__Pyx_PyCode_New(1, 0, 5, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__36, __pyx_empty_tuple, 
__pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_area, 151, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__37)) __PYX_ERR(0, 151, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":163 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + __pyx_tuple__38 = PyTuple_Pack(18, __pyx_n_s_dt, __pyx_n_s_gt, __pyx_n_s_pyiscrowd, __pyx_n_s_preproc, __pyx_n_s_preproc, __pyx_n_s_rleIou, __pyx_n_s_rleIou, __pyx_n_s_bbIou, __pyx_n_s_bbIou, __pyx_n_s_len, __pyx_n_s_len, __pyx_n_s_iscrowd, __pyx_n_s_m, __pyx_n_s_n, __pyx_n_s_iou, __pyx_n_s_shape, __pyx_n_s_iouFun, __pyx_n_s_iou_2); if (unlikely(!__pyx_tuple__38)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__38); + __Pyx_GIVEREF(__pyx_tuple__38); + __pyx_codeobj__39 = (PyObject*)__Pyx_PyCode_New(3, 0, 18, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__38, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_iou_2, 163, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__39)) __PYX_ERR(0, 163, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":233 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + __pyx_tuple__40 = PyTuple_Pack(6, __pyx_n_s_rleObjs, __pyx_n_s_Rs, __pyx_n_s_n, __pyx_n_s_bb_2, __pyx_n_s_shape, __pyx_n_s_bb); if (unlikely(!__pyx_tuple__40)) __PYX_ERR(0, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__40); + __Pyx_GIVEREF(__pyx_tuple__40); + __pyx_codeobj__41 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__40, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_toBbox, 233, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__41)) __PYX_ERR(0, 233, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":245 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + __pyx_tuple__42 = PyTuple_Pack(6, __pyx_n_s_bb, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__42)) __PYX_ERR(0, 245, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__42); + __Pyx_GIVEREF(__pyx_tuple__42); + __pyx_codeobj__43 = (PyObject*)__Pyx_PyCode_New(3, 0, 6, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__42, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_frBbox, 245, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__43)) __PYX_ERR(0, 245, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":252 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + __pyx_tuple__44 = PyTuple_Pack(9, __pyx_n_s_poly, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_np_poly, __pyx_n_s_n, __pyx_n_s_Rs, __pyx_n_s_i, __pyx_n_s_p, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__44)) __PYX_ERR(0, 252, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__44); + __Pyx_GIVEREF(__pyx_tuple__44); + __pyx_codeobj__45 = (PyObject*)__Pyx_PyCode_New(3, 0, 9, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__44, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_frPoly, 252, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__45)) __PYX_ERR(0, 252, 
__pyx_L1_error) + + /* "pycocotools/_mask.pyx":262 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + __pyx_tuple__46 = PyTuple_Pack(11, __pyx_n_s_ucRles, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_cnts, __pyx_n_s_R, __pyx_n_s_data, __pyx_n_s_n, __pyx_n_s_objs, __pyx_n_s_i, __pyx_n_s_Rs, __pyx_n_s_j); if (unlikely(!__pyx_tuple__46)) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__46); + __Pyx_GIVEREF(__pyx_tuple__46); + __pyx_codeobj__47 = (PyObject*)__Pyx_PyCode_New(3, 0, 11, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__46, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_frUncompressedRLE, 262, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__47)) __PYX_ERR(0, 262, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":280 + * return objs + * + * def frPyObjects(pyobj, siz h, w): # <<<<<<<<<<<<<< + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + */ + __pyx_tuple__48 = PyTuple_Pack(4, __pyx_n_s_pyobj, __pyx_n_s_h, __pyx_n_s_w, __pyx_n_s_objs); if (unlikely(!__pyx_tuple__48)) __PYX_ERR(0, 280, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__48); + __Pyx_GIVEREF(__pyx_tuple__48); + __pyx_codeobj__49 = (PyObject*)__Pyx_PyCode_New(3, 0, 4, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__48, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_home_ftian_code_py_faster_rcnn, __pyx_n_s_frPyObjects, 280, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__49)) __PYX_ERR(0, 280, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_int_4 = PyInt_FromLong(4); if (unlikely(!__pyx_int_4)) __PYX_ERR(0, 1, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +#if PY_MAJOR_VERSION < 3 +PyMODINIT_FUNC init_mask(void); /*proto*/ +PyMODINIT_FUNC init_mask(void) +#else +PyMODINIT_FUNC PyInit__mask(void); /*proto*/ +PyMODINIT_FUNC PyInit__mask(void) +#endif +{ + PyObject *__pyx_t_1 = NULL; + int __pyx_t_2; + __Pyx_RefNannyDeclarations + #if CYTHON_REFNANNY + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); + if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); + } + #endif + __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit__mask(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, 
__pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + #ifdef WITH_THREAD /* Python build with threading support? */ + PyEval_InitThreads(); + #endif + #endif + /*--- Module creation code ---*/ + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("_mask", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + #if CYTHON_COMPILING_IN_PYPY + Py_INCREF(__pyx_b); + #endif + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error); + /*--- Initialize various global constants etc. ---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_pycocotools___mask) { + if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "pycocotools._mask")) { + if (unlikely(PyDict_SetItemString(modules, "pycocotools._mask", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global init code ---*/ + /*--- Variable export code ---*/ + /*--- Function export code ---*/ + /*--- Type init code ---*/ + if (PyType_Ready(&__pyx_type_11pycocotools_5_mask_RLEs) < 0) __PYX_ERR(0, 53, __pyx_L1_error) + __pyx_type_11pycocotools_5_mask_RLEs.tp_print = 0; + if (PyObject_SetAttrString(__pyx_m, "RLEs", (PyObject *)&__pyx_type_11pycocotools_5_mask_RLEs) < 0) __PYX_ERR(0, 53, __pyx_L1_error) + __pyx_ptype_11pycocotools_5_mask_RLEs = &__pyx_type_11pycocotools_5_mask_RLEs; + if (PyType_Ready(&__pyx_type_11pycocotools_5_mask_Masks) < 0) __PYX_ERR(0, 74, __pyx_L1_error) + __pyx_type_11pycocotools_5_mask_Masks.tp_print = 0; + if (PyObject_SetAttrString(__pyx_m, "Masks", (PyObject *)&__pyx_type_11pycocotools_5_mask_Masks) < 0) __PYX_ERR(0, 74, __pyx_L1_error) + __pyx_ptype_11pycocotools_5_mask_Masks = &__pyx_type_11pycocotools_5_mask_Masks; + /*--- Type import code ---*/ + __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__Pyx_BUILTIN_MODULE_NAME, "type", + #if CYTHON_COMPILING_IN_PYPY + sizeof(PyTypeObject), + #else + sizeof(PyHeapTypeObject), + #endif + 0); if (unlikely(!__pyx_ptype_7cpython_4type_type)) __PYX_ERR(2, 9, __pyx_L1_error) + __pyx_ptype_5numpy_dtype = __Pyx_ImportType("numpy", "dtype", sizeof(PyArray_Descr), 0); if (unlikely(!__pyx_ptype_5numpy_dtype)) __PYX_ERR(1, 155, __pyx_L1_error) + 
__pyx_ptype_5numpy_flatiter = __Pyx_ImportType("numpy", "flatiter", sizeof(PyArrayIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_flatiter)) __PYX_ERR(1, 168, __pyx_L1_error) + __pyx_ptype_5numpy_broadcast = __Pyx_ImportType("numpy", "broadcast", sizeof(PyArrayMultiIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_broadcast)) __PYX_ERR(1, 172, __pyx_L1_error) + __pyx_ptype_5numpy_ndarray = __Pyx_ImportType("numpy", "ndarray", sizeof(PyArrayObject), 0); if (unlikely(!__pyx_ptype_5numpy_ndarray)) __PYX_ERR(1, 181, __pyx_L1_error) + __pyx_ptype_5numpy_ufunc = __Pyx_ImportType("numpy", "ufunc", sizeof(PyUFuncObject), 0); if (unlikely(!__pyx_ptype_5numpy_ufunc)) __PYX_ERR(1, 861, __pyx_L1_error) + /*--- Variable import code ---*/ + /*--- Function import code ---*/ + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "pycocotools/_mask.pyx":11 + * #************************************************************************** + * + * __author__ = 'tsungyi' # <<<<<<<<<<<<<< + * + * # import both Python-level and C-level symbols of Numpy + */ + if (PyDict_SetItem(__pyx_d, __pyx_n_s_author, __pyx_n_s_tsungyi) < 0) __PYX_ERR(0, 11, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":15 + * # import both Python-level and C-level symbols of Numpy + * # the API uses Numpy to interface C and Python + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * from libc.stdlib cimport malloc, free + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":20 + * + * # intialized Numpy. must do. 
+ * np.import_array() # <<<<<<<<<<<<<< + * + * # import numpy C function + */ + __pyx_t_2 = __pyx_f_5numpy_import_array(); if (unlikely(__pyx_t_2 == -1)) __PYX_ERR(0, 20, __pyx_L1_error) + + /* "pycocotools/_mask.pyx":100 + * + * # internal conversion from Python RLEs object to compressed RLE format + * def _toString(RLEs Rs): # <<<<<<<<<<<<<< + * cdef siz n = Rs.n + * cdef bytes py_string + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_1_toString, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_toString, __pyx_t_1) < 0) __PYX_ERR(0, 100, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":116 + * + * # internal conversion from compressed RLE format to Python RLEs object + * def _frString(rleObjs): # <<<<<<<<<<<<<< + * cdef siz n = len(rleObjs) + * Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_3_frString, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 116, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frString, __pyx_t_1) < 0) __PYX_ERR(0, 116, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":129 + * # encode mask to RLEs objects + * # list of RLE string can be generated by RLEs member function + * def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): # <<<<<<<<<<<<<< + * h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + * cdef RLEs Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_5encode, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 129, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_encode, __pyx_t_1) < 0) __PYX_ERR(0, 129, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":137 + * + * # decode mask from compressed list of RLE string or RLEs object + * def decode(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_7decode, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_decode, __pyx_t_1) < 0) __PYX_ERR(0, 137, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":144 + * return np.array(masks) + * + * def merge(rleObjs, bint intersect=0): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef RLEs R = RLEs(1) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_9merge, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 144, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_merge, __pyx_t_1) < 0) __PYX_ERR(0, 144, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":151 + * return obj + * + * def area(rleObjs): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef uint* _a = malloc(Rs._n* sizeof(uint)) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_11area, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_area, __pyx_t_1) < 0) __PYX_ERR(0, 151, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); 
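/* Each top-level def in _mask.pyx is wrapped as a PyCFunction and published in the module dict, which is what makes pycocotools._mask.encode/decode/merge/area callable from Python. A minimal usage sketch, assuming the usual pycocotools.mask wrapper around this module: import numpy as np; from pycocotools import mask as maskUtils; m = np.asfortranarray(np.ones((4, 4, 1), dtype=np.uint8)); rle = maskUtils.encode(m); assert maskUtils.area(rle)[0] == 16 and (maskUtils.decode(rle) == m).all() */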
__pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":163 + * + * # iou computation. support function overload (RLEs-RLEs and bbox-bbox). + * def iou( dt, gt, pyiscrowd ): # <<<<<<<<<<<<<< + * def _preproc(objs): + * if len(objs) == 0: + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_13iou, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_iou_2, __pyx_t_1) < 0) __PYX_ERR(0, 163, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":233 + * return iou.reshape((m,n), order='F') + * + * def toBbox( rleObjs ): # <<<<<<<<<<<<<< + * cdef RLEs Rs = _frString(rleObjs) + * cdef siz n = Rs.n + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_15toBbox, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 233, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_toBbox, __pyx_t_1) < 0) __PYX_ERR(0, 233, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":245 + * return bb + * + * def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef siz n = bb.shape[0] + * Rs = RLEs(n) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_17frBbox, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 245, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frBbox, __pyx_t_1) < 0) __PYX_ERR(0, 245, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":252 + * return objs + * + * def frPoly( poly, siz h, siz w ): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.double_t, ndim=1] np_poly + * n = len(poly) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_19frPoly, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 252, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frPoly, __pyx_t_1) < 0) __PYX_ERR(0, 252, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":262 + * return objs + * + * def frUncompressedRLE(ucRles, siz h, siz w): # <<<<<<<<<<<<<< + * cdef np.ndarray[np.uint32_t, ndim=1] cnts + * cdef RLE R + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_21frUncompressedRLE, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frUncompressedRLE, __pyx_t_1) < 0) __PYX_ERR(0, 262, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":280 + * return objs + * + * def frPyObjects(pyobj, siz h, w): # <<<<<<<<<<<<<< + * if type(pyobj) == np.ndarray: + * objs = frBbox(pyobj, h, w ) + */ + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_11pycocotools_5_mask_23frPyObjects, NULL, __pyx_n_s_pycocotools__mask); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 280, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_frPyObjects, __pyx_t_1) < 0) __PYX_ERR(0, 280, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "pycocotools/_mask.pyx":1 + * # distutils: language = c # <<<<<<<<<<<<<< + * # distutils: sources = ../MatlabAPI/private/maskApi.c + * + */ + __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) __PYX_ERR(0, 1, 
__pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "../../../.local/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":997 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init pycocotools._mask", __pyx_clineno, __pyx_lineno, __pyx_filename); + } + Py_DECREF(__pyx_m); __pyx_m = 0; + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init pycocotools._mask"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if PY_MAJOR_VERSION < 3 + return; + #else + return __pyx_m; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule((char *)modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, (char *)"RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* GetBuiltinName */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 
1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? "" : "s", num_found); +} + +/* BytesEquals */ +static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else + if (s1 == s2) { + return (equals == Py_EQ); + } else if (PyBytes_CheckExact(s1) & PyBytes_CheckExact(s2)) { + const char *ps1, *ps2; + Py_ssize_t length = PyBytes_GET_SIZE(s1); + if (length != PyBytes_GET_SIZE(s2)) + return (equals == Py_NE); + ps1 = PyBytes_AS_STRING(s1); + ps2 = PyBytes_AS_STRING(s2); + if (ps1[0] != ps2[0]) { + return (equals == Py_NE); + } else if (length == 1) { + return (equals == Py_EQ); + } else { + int result = memcmp(ps1, ps2, (size_t)length); + return (equals == Py_EQ) ? 
(result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & PyBytes_CheckExact(s2)) { + return (equals == Py_NE); + } else if ((s2 == Py_None) & PyBytes_CheckExact(s1)) { + return (equals == Py_NE); + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +#endif +} + +/* UnicodeEquals */ +static CYTHON_INLINE int __Pyx_PyUnicode_Equals(PyObject* s1, PyObject* s2, int equals) { +#if CYTHON_COMPILING_IN_PYPY + return PyObject_RichCompareBool(s1, s2, equals); +#else +#if PY_MAJOR_VERSION < 3 + PyObject* owned_ref = NULL; +#endif + int s1_is_unicode, s2_is_unicode; + if (s1 == s2) { + goto return_eq; + } + s1_is_unicode = PyUnicode_CheckExact(s1); + s2_is_unicode = PyUnicode_CheckExact(s2); +#if PY_MAJOR_VERSION < 3 + if ((s1_is_unicode & (!s2_is_unicode)) && PyString_CheckExact(s2)) { + owned_ref = PyUnicode_FromObject(s2); + if (unlikely(!owned_ref)) + return -1; + s2 = owned_ref; + s2_is_unicode = 1; + } else if ((s2_is_unicode & (!s1_is_unicode)) && PyString_CheckExact(s1)) { + owned_ref = PyUnicode_FromObject(s1); + if (unlikely(!owned_ref)) + return -1; + s1 = owned_ref; + s1_is_unicode = 1; + } else if (((!s2_is_unicode) & (!s1_is_unicode))) { + return __Pyx_PyBytes_Equals(s1, s2, equals); + } +#endif + if (s1_is_unicode & s2_is_unicode) { + Py_ssize_t length; + int kind; + void *data1, *data2; + if (unlikely(__Pyx_PyUnicode_READY(s1) < 0) || unlikely(__Pyx_PyUnicode_READY(s2) < 0)) + return -1; + length = __Pyx_PyUnicode_GET_LENGTH(s1); + if (length != __Pyx_PyUnicode_GET_LENGTH(s2)) { + goto return_ne; + } + kind = __Pyx_PyUnicode_KIND(s1); + if (kind != __Pyx_PyUnicode_KIND(s2)) { + goto return_ne; + } + data1 = __Pyx_PyUnicode_DATA(s1); + data2 = __Pyx_PyUnicode_DATA(s2); + if (__Pyx_PyUnicode_READ(kind, data1, 0) != __Pyx_PyUnicode_READ(kind, data2, 0)) { + goto return_ne; + } else if (length == 1) { + goto return_eq; + } else { + int result = memcmp(data1, data2, (size_t)(length * kind)); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ) ? 
(result == 0) : (result != 0); + } + } else if ((s1 == Py_None) & s2_is_unicode) { + goto return_ne; + } else if ((s2 == Py_None) & s1_is_unicode) { + goto return_ne; + } else { + int result; + PyObject* py_result = PyObject_RichCompare(s1, s2, equals); + if (!py_result) + return -1; + result = __Pyx_PyObject_IsTrue(py_result); + Py_DECREF(py_result); + return result; + } +return_eq: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_EQ); +return_ne: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(owned_ref); + #endif + return (equals == Py_NE); +#endif +} + +/* PyObjectCall */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = func->ob_type->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyErrFetchRestore */ +#if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* RaiseException */ +#if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a 
traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } +#if PY_VERSION_HEX >= 0x03030000 + if (cause) { +#else + if (cause && cause != Py_None) { +#endif + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = PyThreadState_GET(); + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* ExtTypeTest */ + static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) { + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (likely(PyObject_TypeCheck(obj, type))) + return 1; + PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s", + Py_TYPE(obj)->tp_name, type->tp_name); + return 0; +} + +/* ArgTypeTest */ + static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) { + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); +} +static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed, + const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (none_allowed && obj == Py_None) return 1; + else if (exact) { + if (likely(Py_TYPE(obj) == type)) return 1; + #if PY_MAJOR_VERSION == 2 + else if ((type == &PyBaseString_Type) && 
likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(PyObject_TypeCheck(obj, type))) return 1; + } + __Pyx_RaiseArgumentTypeInvalid(name, obj, type); + return 0; +} + +/* PyIntBinop */ + #if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { + #if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(op1))) { + const long b = intval; + long x; + long a = PyInt_AS_LONG(op1); + x = (long)((unsigned long)a + b); + if (likely((x^a) >= 0 || (x^b) >= 0)) + return PyInt_FromLong(x); + return PyLong_Type.tp_as_number->nb_add(op1, op2); + } + #endif + #if CYTHON_USE_PYLONG_INTERNALS + if (likely(PyLong_CheckExact(op1))) { + const long b = intval; + long a, x; +#ifdef HAVE_LONG_LONG + const PY_LONG_LONG llb = intval; + PY_LONG_LONG lla, llx; +#endif + const digit* digits = ((PyLongObject*)op1)->ob_digit; + const Py_ssize_t size = Py_SIZE(op1); + if (likely(__Pyx_sst_abs(size) <= 1)) { + a = likely(size) ? digits[0] : 0; + if (size == -1) a = -a; + } else { + switch (size) { + case -2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) { + lla = -(PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + case 2: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 2 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + case -3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) { + lla = -(PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + case 3: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 3 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((((unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + case -4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) { + lla = -(PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + case 4: + if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + a = (long) 
(((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); + break; +#ifdef HAVE_LONG_LONG + } else if (8 * sizeof(PY_LONG_LONG) - 1 > 4 * PyLong_SHIFT) { + lla = (PY_LONG_LONG) (((((((((unsigned PY_LONG_LONG)digits[3]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[2]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[1]) << PyLong_SHIFT) | (unsigned PY_LONG_LONG)digits[0])); + goto long_long; +#endif + } + default: return PyLong_Type.tp_as_number->nb_add(op1, op2); + } + } + x = a + b; + return PyLong_FromLong(x); +#ifdef HAVE_LONG_LONG + long_long: + llx = lla + llb; + return PyLong_FromLongLong(llx); +#endif + + + } + #endif + if (PyFloat_CheckExact(op1)) { + const long b = intval; + double a = PyFloat_AS_DOUBLE(op1); + double result; + PyFPE_START_PROTECT("add", return NULL) + result = ((double)a) + (double)b; + PyFPE_END_PROTECT(result) + return PyFloat_FromDouble(result); + } + return (inplace ? PyNumber_InPlaceAdd : PyNumber_Add)(op1, op2); +} +#endif + +/* GetItemInt */ + static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { + PyObject *r; + if (!j) return NULL; + r = PyObject_GetItem(o, j); + Py_DECREF(j); + return r; +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + if (wraparound & unlikely(i < 0)) i += PyList_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyList_GET_SIZE(o)))) { + PyObject *r = PyList_GET_ITEM(o, i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS + if (wraparound & unlikely(i < 0)) i += PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((0 <= i) & (i < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, i); + Py_INCREF(r); + return r; + } + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +#else + return PySequence_GetItem(o, i); +#endif +} +static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list, + CYTHON_NCP_UNUSED int wraparound, + CYTHON_NCP_UNUSED int boundscheck) { +#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS + if (is_list || PyList_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o); + if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) { + PyObject *r = PyList_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } + else if (PyTuple_CheckExact(o)) { + Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? 
i : i + PyTuple_GET_SIZE(o); + if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) { + PyObject *r = PyTuple_GET_ITEM(o, n); + Py_INCREF(r); + return r; + } + } else { + PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence; + if (likely(m && m->sq_item)) { + if (wraparound && unlikely(i < 0) && likely(m->sq_length)) { + Py_ssize_t l = m->sq_length(o); + if (likely(l >= 0)) { + i += l; + } else { + if (!PyErr_ExceptionMatches(PyExc_OverflowError)) + return NULL; + PyErr_Clear(); + } + } + return m->sq_item(o, i); + } + } +#else + if (is_list || PySequence_Check(o)) { + return PySequence_GetItem(o, i); + } +#endif + return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); +} + +/* BufferFormatCheck */ + static CYTHON_INLINE int __Pyx_IsLittleEndian(void) { + unsigned int n = 1; + return *(unsigned char*)(&n) != 0; +} +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type) { + stack[0].field = &ctx->root; + stack[0].parent_offset = 0; + ctx->root.type = type; + ctx->root.name = "buffer dtype"; + ctx->root.offset = 0; + ctx->head = stack; + ctx->head->field = &ctx->root; + ctx->fmt_offset = 0; + ctx->head->parent_offset = 0; + ctx->new_packmode = '@'; + ctx->enc_packmode = '@'; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->is_complex = 0; + ctx->is_valid_array = 0; + ctx->struct_alignment = 0; + while (type->typegroup == 'S') { + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = 0; + type = type->fields->type; + } +} +static int __Pyx_BufFmt_ParseNumber(const char** ts) { + int count; + const char* t = *ts; + if (*t < '0' || *t > '9') { + return -1; + } else { + count = *t++ - '0'; + while (*t >= '0' && *t <= '9') { + count *= 10; + count += *t++ - '0'; + } + } + *ts = t; + return count; +} +static int __Pyx_BufFmt_ExpectNumber(const char **ts) { + int number = __Pyx_BufFmt_ParseNumber(ts); + if (number == -1) + PyErr_Format(PyExc_ValueError,\ + "Does not understand character buffer dtype format string ('%c')", **ts); + return number; +} +static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { + PyErr_Format(PyExc_ValueError, + "Unexpected format string character: '%c'", ch); +} +static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) { + switch (ch) { + case 'c': return "'char'"; + case 'b': return "'signed char'"; + case 'B': return "'unsigned char'"; + case 'h': return "'short'"; + case 'H': return "'unsigned short'"; + case 'i': return "'int'"; + case 'I': return "'unsigned int'"; + case 'l': return "'long'"; + case 'L': return "'unsigned long'"; + case 'q': return "'long long'"; + case 'Q': return "'unsigned long long'"; + case 'f': return (is_complex ? "'complex float'" : "'float'"); + case 'd': return (is_complex ? "'complex double'" : "'double'"); + case 'g': return (is_complex ? "'complex long double'" : "'long double'"); + case 'T': return "a struct"; + case 'O': return "Python object"; + case 'P': return "a pointer"; + case 's': case 'p': return "a string"; + case 0: return "end"; + default: return "unparseable format string"; + } +} +static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return 2; + case 'i': case 'I': case 'l': case 'L': return 4; + case 'q': case 'Q': return 8; + case 'f': return (is_complex ? 8 : 4); + case 'd': return (is_complex ? 
16 : 8); + case 'g': { + PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g')."); + return 0; + } + case 'O': case 'P': return sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { + switch (ch) { + case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(short); + case 'i': case 'I': return sizeof(int); + case 'l': case 'L': return sizeof(long); + #ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(PY_LONG_LONG); + #endif + case 'f': return sizeof(float) * (is_complex ? 2 : 1); + case 'd': return sizeof(double) * (is_complex ? 2 : 1); + case 'g': return sizeof(long double) * (is_complex ? 2 : 1); + case 'O': case 'P': return sizeof(void*); + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +typedef struct { char c; short x; } __Pyx_st_short; +typedef struct { char c; int x; } __Pyx_st_int; +typedef struct { char c; long x; } __Pyx_st_long; +typedef struct { char c; float x; } __Pyx_st_float; +typedef struct { char c; double x; } __Pyx_st_double; +typedef struct { char c; long double x; } __Pyx_st_longdouble; +typedef struct { char c; void *x; } __Pyx_st_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_st_float) - sizeof(float); + case 'd': return sizeof(__Pyx_st_double) - sizeof(double); + case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +/* These are for computing the padding at the end of the struct to align + on the first member of the struct. This will probably be the same as above, + but we don't have any guarantees. 
+ */ +typedef struct { short x; char c; } __Pyx_pad_short; +typedef struct { int x; char c; } __Pyx_pad_int; +typedef struct { long x; char c; } __Pyx_pad_long; +typedef struct { float x; char c; } __Pyx_pad_float; +typedef struct { double x; char c; } __Pyx_pad_double; +typedef struct { long double x; char c; } __Pyx_pad_longdouble; +typedef struct { void *x; char c; } __Pyx_pad_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_pad_float) - sizeof(float); + case 'd': return sizeof(__Pyx_pad_double) - sizeof(double); + case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { + switch (ch) { + case 'c': + return 'H'; + case 'b': case 'h': case 'i': + case 'l': case 'q': case 's': case 'p': + return 'I'; + case 'B': case 'H': case 'I': case 'L': case 'Q': + return 'U'; + case 'f': case 'd': case 'g': + return (is_complex ? 'C' : 'R'); + case 'O': + return 'O'; + case 'P': + return 'P'; + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) { + if (ctx->head == NULL || ctx->head->field == &ctx->root) { + const char* expected; + const char* quote; + if (ctx->head == NULL) { + expected = "end"; + quote = ""; + } else { + expected = ctx->head->field->type->name; + quote = "'"; + } + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected %s%s%s but got %s", + quote, expected, quote, + __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex)); + } else { + __Pyx_StructField* field = ctx->head->field; + __Pyx_StructField* parent = (ctx->head - 1)->field; + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'", + field->type->name, __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex), + parent->type->name, field->name); + } +} +static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { + char group; + size_t size, offset, arraysize = 1; + if (ctx->enc_type == 0) return 0; + if (ctx->head->field->type->arraysize[0]) { + int i, ndim = 0; + if (ctx->enc_type == 's' || ctx->enc_type == 'p') { + ctx->is_valid_array = ctx->head->field->type->ndim == 1; + ndim = 1; + if (ctx->enc_count != ctx->head->field->type->arraysize[0]) { + PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %zu", + ctx->head->field->type->arraysize[0], ctx->enc_count); + return -1; + } + } + if (!ctx->is_valid_array) { + PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d", + ctx->head->field->type->ndim, ndim); + return -1; + } + for (i = 0; i < ctx->head->field->type->ndim; i++) { + arraysize *= ctx->head->field->type->arraysize[i]; + } + ctx->is_valid_array = 0; + ctx->enc_count = 1; + } + group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, 
ctx->is_complex); + do { + __Pyx_StructField* field = ctx->head->field; + __Pyx_TypeInfo* type = field->type; + if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { + size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex); + } else { + size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex); + } + if (ctx->enc_packmode == '@') { + size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex); + size_t align_mod_offset; + if (align_at == 0) return -1; + align_mod_offset = ctx->fmt_offset % align_at; + if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; + if (ctx->struct_alignment == 0) + ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type, + ctx->is_complex); + } + if (type->size != size || type->typegroup != group) { + if (type->typegroup == 'C' && type->fields != NULL) { + size_t parent_offset = ctx->head->parent_offset + field->offset; + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = parent_offset; + continue; + } + if ((type->typegroup == 'H' || group == 'H') && type->size == size) { + } else { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + } + offset = ctx->head->parent_offset + field->offset; + if (ctx->fmt_offset != offset) { + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected", + (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset); + return -1; + } + ctx->fmt_offset += size; + if (arraysize) + ctx->fmt_offset += (arraysize - 1) * size; + --ctx->enc_count; + while (1) { + if (field == &ctx->root) { + ctx->head = NULL; + if (ctx->enc_count != 0) { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + break; + } + ctx->head->field = ++field; + if (field->type == NULL) { + --ctx->head; + field = ctx->head->field; + continue; + } else if (field->type->typegroup == 'S') { + size_t parent_offset = ctx->head->parent_offset + field->offset; + if (field->type->fields->type == NULL) continue; + field = field->type->fields; + ++ctx->head; + ctx->head->field = field; + ctx->head->parent_offset = parent_offset; + break; + } else { + break; + } + } + } while (ctx->enc_count); + ctx->enc_type = 0; + ctx->is_complex = 0; + return 0; +} +static CYTHON_INLINE PyObject * +__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp) +{ + const char *ts = *tsp; + int i = 0, number; + int ndim = ctx->head->field->type->ndim; +; + ++ts; + if (ctx->new_count != 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot handle repeated arrays in format string"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + while (*ts && *ts != ')') { + switch (*ts) { + case ' ': case '\f': case '\r': case '\n': case '\t': case '\v': continue; + default: break; + } + number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) + return PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %d", + ctx->head->field->type->arraysize[i], number); + if (*ts != ',' && *ts != ')') + return PyErr_Format(PyExc_ValueError, + "Expected a comma in format string, got '%c'", *ts); + if (*ts == ',') ts++; + i++; + } + if (i != ndim) + return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d", + ctx->head->field->type->ndim, i); + if (!*ts) { + PyErr_SetString(PyExc_ValueError, + "Unexpected end of format string, expected ')'"); + return NULL; + } + 
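/* Reaching this point means every extent listed inside the '(...)' group matched the dimension sizes declared for the element type, so the PEP 3118 sub-array spec is accepted and scanning resumes after the ')'. */ +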
ctx->is_valid_array = 1; + ctx->new_count = 1; + *tsp = ++ts; + return Py_None; +} +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) { + int got_Z = 0; + while (1) { + switch(*ts) { + case 0: + if (ctx->enc_type != 0 && ctx->head == NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + if (ctx->head != NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + return ts; + case ' ': + case '\r': + case '\n': + ++ts; + break; + case '<': + if (!__Pyx_IsLittleEndian()) { + PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '>': + case '!': + if (__Pyx_IsLittleEndian()) { + PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '=': + case '@': + case '^': + ctx->new_packmode = *ts++; + break; + case 'T': + { + const char* ts_after_sub; + size_t i, struct_count = ctx->new_count; + size_t struct_alignment = ctx->struct_alignment; + ctx->new_count = 1; + ++ts; + if (*ts != '{') { + PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + ctx->enc_count = 0; + ctx->struct_alignment = 0; + ++ts; + ts_after_sub = ts; + for (i = 0; i != struct_count; ++i) { + ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts); + if (!ts_after_sub) return NULL; + } + ts = ts_after_sub; + if (struct_alignment) ctx->struct_alignment = struct_alignment; + } + break; + case '}': + { + size_t alignment = ctx->struct_alignment; + ++ts; + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + if (alignment && ctx->fmt_offset % alignment) { + ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment); + } + } + return ts; + case 'x': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->fmt_offset += ctx->new_count; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->enc_packmode = ctx->new_packmode; + ++ts; + break; + case 'Z': + got_Z = 1; + ++ts; + if (*ts != 'f' && *ts != 'd' && *ts != 'g') { + __Pyx_BufFmt_RaiseUnexpectedChar('Z'); + return NULL; + } + case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I': + case 'l': case 'L': case 'q': case 'Q': + case 'f': case 'd': case 'g': + case 'O': case 'p': + if (ctx->enc_type == *ts && got_Z == ctx->is_complex && + ctx->enc_packmode == ctx->new_packmode) { + ctx->enc_count += ctx->new_count; + ctx->new_count = 1; + got_Z = 0; + ++ts; + break; + } + case 's': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_count = ctx->new_count; + ctx->enc_packmode = ctx->new_packmode; + ctx->enc_type = *ts; + ctx->is_complex = got_Z; + ++ts; + ctx->new_count = 1; + got_Z = 0; + break; + case ':': + ++ts; + while(*ts != ':') ++ts; + ++ts; + break; + case '(': + if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL; + break; + default: + { + int number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + ctx->new_count = (size_t)number; + } + } + } +} +static CYTHON_INLINE void __Pyx_ZeroBuffer(Py_buffer* buf) { + buf->buf = NULL; + buf->obj = NULL; + buf->strides = __Pyx_zeros; + buf->shape = __Pyx_zeros; + buf->suboffsets = __Pyx_minusones; +} +static CYTHON_INLINE int __Pyx_GetBufferAndValidate( + Py_buffer* buf, 
PyObject* obj, __Pyx_TypeInfo* dtype, int flags, + int nd, int cast, __Pyx_BufFmt_StackElem* stack) +{ + if (obj == Py_None || obj == NULL) { + __Pyx_ZeroBuffer(buf); + return 0; + } + buf->buf = NULL; + if (__Pyx_GetBuffer(obj, buf, flags) == -1) goto fail; + if (buf->ndim != nd) { + PyErr_Format(PyExc_ValueError, + "Buffer has wrong number of dimensions (expected %d, got %d)", + nd, buf->ndim); + goto fail; + } + if (!cast) { + __Pyx_BufFmt_Context ctx; + __Pyx_BufFmt_Init(&ctx, stack, dtype); + if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail; + } + if ((unsigned)buf->itemsize != dtype->size) { + PyErr_Format(PyExc_ValueError, + "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)", + buf->itemsize, (buf->itemsize > 1) ? "s" : "", + dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : ""); + goto fail; + } + if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; + return 0; +fail:; + __Pyx_ZeroBuffer(buf); + return -1; +} +static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) { + if (info->buf == NULL) return; + if (info->suboffsets == __Pyx_minusones) info->suboffsets = NULL; + __Pyx_ReleaseBuffer(info); +} + +/* GetModuleGlobalName */ + static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { + PyObject *result; +#if !CYTHON_AVOID_BORROWED_REFS + result = PyDict_GetItem(__pyx_d, name); + if (likely(result)) { + Py_INCREF(result); + } else { +#else + result = PyObject_GetItem(__pyx_d, name); + if (!result) { + PyErr_Clear(); +#endif + result = __Pyx_GetBuiltinName(name); + } + return result; +} + +/* PyCFunctionFastCall */ + #if CYTHON_FAST_PYCCALL +static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) { + PyCFunctionObject *func = (PyCFunctionObject*)func_obj; + PyCFunction meth = PyCFunction_GET_FUNCTION(func); + PyObject *self = PyCFunction_GET_SELF(func); + assert(PyCFunction_Check(func)); + assert(METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST))); + assert(nargs >= 0); + assert(nargs == 0 || args != NULL); + /* _PyCFunction_FastCallDict() must not be called with an exception set, + because it may clear it (directly or indirectly) and so the + caller loses its exception */ + assert(!PyErr_Occurred()); + return (*((__Pyx_PyCFunctionFast)meth)) (self, args, nargs, NULL); +} +#endif // CYTHON_FAST_PYCCALL + +/* PyFunctionFastCall */ + #if CYTHON_FAST_PYCALL +#include "frameobject.h" +static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na, + PyObject *globals) { + PyFrameObject *f; + PyThreadState *tstate = PyThreadState_GET(); + PyObject **fastlocals; + Py_ssize_t i; + PyObject *result; + assert(globals != NULL); + /* XXX Perhaps we should create a specialized + PyFrame_New() that doesn't take locals, but does + take builtins without sanity checking them. 
+ */ + assert(tstate != NULL); + f = PyFrame_New(tstate, co, globals, NULL); + if (f == NULL) { + return NULL; + } + fastlocals = f->f_localsplus; + for (i = 0; i < na; i++) { + Py_INCREF(*args); + fastlocals[i] = *args++; + } + result = PyEval_EvalFrameEx(f,0); + ++tstate->recursion_depth; + Py_DECREF(f); + --tstate->recursion_depth; + return result; +} +#if 1 || PY_VERSION_HEX < 0x030600B1 +static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs) { + PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func); + PyObject *globals = PyFunction_GET_GLOBALS(func); + PyObject *argdefs = PyFunction_GET_DEFAULTS(func); + PyObject *closure; +#if PY_MAJOR_VERSION >= 3 + PyObject *kwdefs; +#endif + PyObject *kwtuple, **k; + PyObject **d; + Py_ssize_t nd; + Py_ssize_t nk; + PyObject *result; + assert(kwargs == NULL || PyDict_Check(kwargs)); + nk = kwargs ? PyDict_Size(kwargs) : 0; + if (Py_EnterRecursiveCall((char*)" while calling a Python object")) { + return NULL; + } + if ( +#if PY_MAJOR_VERSION >= 3 + co->co_kwonlyargcount == 0 && +#endif + likely(kwargs == NULL || nk == 0) && + co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) { + if (argdefs == NULL && co->co_argcount == nargs) { + result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals); + goto done; + } + else if (nargs == 0 && argdefs != NULL + && co->co_argcount == Py_SIZE(argdefs)) { + /* function called with no arguments, but all parameters have + a default value: use default values as arguments .*/ + args = &PyTuple_GET_ITEM(argdefs, 0); + result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals); + goto done; + } + } + if (kwargs != NULL) { + Py_ssize_t pos, i; + kwtuple = PyTuple_New(2 * nk); + if (kwtuple == NULL) { + result = NULL; + goto done; + } + k = &PyTuple_GET_ITEM(kwtuple, 0); + pos = i = 0; + while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) { + Py_INCREF(k[i]); + Py_INCREF(k[i+1]); + i += 2; + } + nk = i / 2; + } + else { + kwtuple = NULL; + k = NULL; + } + closure = PyFunction_GET_CLOSURE(func); +#if PY_MAJOR_VERSION >= 3 + kwdefs = PyFunction_GET_KW_DEFAULTS(func); +#endif + if (argdefs != NULL) { + d = &PyTuple_GET_ITEM(argdefs, 0); + nd = Py_SIZE(argdefs); + } + else { + d = NULL; + nd = 0; + } +#if PY_MAJOR_VERSION >= 3 + result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, kwdefs, closure); +#else + result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL, + args, nargs, + k, (int)nk, + d, (int)nd, closure); +#endif + Py_XDECREF(kwtuple); +done: + Py_LeaveRecursiveCall(); + return result; +} +#endif // CPython < 3.6 +#endif // CYTHON_FAST_PYCALL + +/* PyObjectCallMethO */ + #if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) { + PyObject *self, *result; + PyCFunction cfunc; + cfunc = PyCFunction_GET_FUNCTION(func); + self = PyCFunction_GET_SELF(func); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = cfunc(self, arg); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* PyObjectCallOneArg */ + #if CYTHON_COMPILING_IN_CPYTHON +static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_New(1); + if (unlikely(!args)) return NULL; + 
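/* Slow path: pack the single argument into a fresh 1-tuple (PyTuple_SET_ITEM below steals the reference taken by Py_INCREF) and dispatch through the generic call protocol. */ +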
Py_INCREF(arg); + PyTuple_SET_ITEM(args, 0, arg); + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { +#if CYTHON_FAST_PYCALL + if (PyFunction_Check(func)) { + return __Pyx_PyFunction_FastCall(func, &arg, 1); + } +#endif +#ifdef __Pyx_CyFunction_USED + if (likely(PyCFunction_Check(func) || PyObject_TypeCheck(func, __pyx_CyFunctionType))) { +#else + if (likely(PyCFunction_Check(func))) { +#endif + if (likely(PyCFunction_GET_FLAGS(func) & METH_O)) { + return __Pyx_PyObject_CallMethO(func, arg); +#if CYTHON_FAST_PYCCALL + } else if (PyCFunction_GET_FLAGS(func) & METH_FASTCALL) { + return __Pyx_PyCFunction_FastCall(func, &arg, 1); +#endif + } + } + return __Pyx__PyObject_CallOneArg(func, arg); +} +#else +static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { + PyObject *result; + PyObject *args = PyTuple_Pack(1, arg); + if (unlikely(!args)) return NULL; + result = __Pyx_PyObject_Call(func, args, NULL); + Py_DECREF(args); + return result; +} +#endif + +/* PyIntBinop */ + #if !CYTHON_COMPILING_IN_PYPY +static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { + if (op1 == op2) { + Py_RETURN_TRUE; + } + #if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(op1))) { + const long b = intval; + long a = PyInt_AS_LONG(op1); + if (a == b) { + Py_RETURN_TRUE; + } else { + Py_RETURN_FALSE; + } + } + #endif + #if CYTHON_USE_PYLONG_INTERNALS + if (likely(PyLong_CheckExact(op1))) { + const long b = intval; + long a; + const digit* digits = ((PyLongObject*)op1)->ob_digit; + const Py_ssize_t size = Py_SIZE(op1); + if (likely(__Pyx_sst_abs(size) <= 1)) { + a = likely(size) ? 
digits[0] : 0;
[~4,000 lines of Cython-generated module boilerplate elided. In order, the elided sections of this hunk are:
  - the remaining two-, three- and four-digit and float fast paths of the PyLong equality helper above;
  - FetchCommonType;
  - CythonFunction: the __pyx_CyFunctionType type object, with getters and setters for __doc__, __name__,
    __qualname__, __self__, __dict__, __globals__, __closure__, __code__, __defaults__, __kwdefaults__ and
    __annotations__, GC traverse/clear/dealloc, descr_get, repr (formatting "<cyfunction %U at %p>"), and
    call dispatch for METH_VARARGS, METH_KEYWORDS, METH_NOARGS and METH_O;
  - BufferFallbackError; __Pyx_div_Py_ssize_t (Python floor division); BufferIndexError;
    RaiseTooManyValuesToUnpack; RaiseNeedMoreValuesToUnpack; RaiseNoneIterError; SaveResetException;
    PyErrExceptionMatches; GetException; Import;
  - CodeObjectCache (a sorted map from C line numbers to code objects) and AddTraceback, which together
    synthesize Python frames for C-level tracebacks;
  - Python 2 __Pyx_GetBuffer/__Pyx_ReleaseBuffer shims;
  - the __PYX_VERIFY_RETURN_INT overflow-check macros, __Pyx_PyInt_From_* converters for siz, long,
    Py_intptr_t, int and enum NPY_TYPES, and __Pyx_PyInt_As_* converters for siz, size_t and int;
  - float and double complex declarations and arithmetic (eq, sum, diff, prod, quotient, neg, is_zero,
    conj, abs, pow).]
if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* CIntFromPy */ + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + const long neg_one = (long) -1, const_zero = (long) 0; + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned 
long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, 
(((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* CheckBinaryVersion */ + static int __Pyx_check_binary_version(void) { + char ctversion[4], rtversion[4]; + PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); + PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); + if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) { + char message[200]; + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + return PyErr_WarnEx(NULL, message, 1); + } + return 0; +} + +/* ModuleImport */ + #ifndef __PYX_HAVE_RT_ImportModule +#define __PYX_HAVE_RT_ImportModule +static PyObject *__Pyx_ImportModule(const char *name) { + PyObject 
*py_name = 0; + PyObject *py_module = 0; + py_name = __Pyx_PyIdentifier_FromString(name); + if (!py_name) + goto bad; + py_module = PyImport_Import(py_name); + Py_DECREF(py_name); + return py_module; +bad: + Py_XDECREF(py_name); + return 0; +} +#endif + +/* TypeImport */ + #ifndef __PYX_HAVE_RT_ImportType +#define __PYX_HAVE_RT_ImportType +static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, + size_t size, int strict) +{ + PyObject *py_module = 0; + PyObject *result = 0; + PyObject *py_name = 0; + char warning[200]; + Py_ssize_t basicsize; +#ifdef Py_LIMITED_API + PyObject *py_basicsize; +#endif + py_module = __Pyx_ImportModule(module_name); + if (!py_module) + goto bad; + py_name = __Pyx_PyIdentifier_FromString(class_name); + if (!py_name) + goto bad; + result = PyObject_GetAttr(py_module, py_name); + Py_DECREF(py_name); + py_name = 0; + Py_DECREF(py_module); + py_module = 0; + if (!result) + goto bad; + if (!PyType_Check(result)) { + PyErr_Format(PyExc_TypeError, + "%.200s.%.200s is not a type object", + module_name, class_name); + goto bad; + } +#ifndef Py_LIMITED_API + basicsize = ((PyTypeObject *)result)->tp_basicsize; +#else + py_basicsize = PyObject_GetAttrString(result, "__basicsize__"); + if (!py_basicsize) + goto bad; + basicsize = PyLong_AsSsize_t(py_basicsize); + Py_DECREF(py_basicsize); + py_basicsize = 0; + if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred()) + goto bad; +#endif + if (!strict && (size_t)basicsize > size) { + PyOS_snprintf(warning, sizeof(warning), + "%s.%s size changed, may indicate binary incompatibility. Expected %zd, got %zd", + module_name, class_name, basicsize, size); + if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad; + } + else if ((size_t)basicsize != size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s has the wrong size, try recompiling. 
Expected %zd, got %zd", + module_name, class_name, basicsize, size); + goto bad; + } + return (PyTypeObject *)result; +bad: + Py_XDECREF(py_module); + Py_XDECREF(result); + return NULL; +} +#endif + +/* InitStrings */ + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if CYTHON_COMPILING_IN_CPYTHON && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + PyUnicode_Check(o)) { +#if PY_VERSION_HEX < 0x03030000 + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +#else + if (__Pyx_PyUnicode_READY(o) == -1) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (PyUnicode_IS_ASCII(o)) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +#endif + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (PyInt_Check(x) || PyLong_Check(x)) +#else + if (PyLong_Check(x)) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = PyNumber_Long(x); + } 
+ #else + if (m && m->nb_int) { + name = "int"; + res = PyNumber_Long(x); + } + #endif +#else + res = PyNumber_Int(x); +#endif + if (res) { +#if PY_MAJOR_VERSION < 3 + if (!PyInt_Check(res) && !PyLong_Check(res)) { +#else + if (!PyLong_Check(res)) { +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + name, name, Py_TYPE(res)->tp_name); + Py_DECREF(res); + return NULL; + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(x); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + ival = likely(size) ? digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/examples/faster-rcnn/lib/pycocotools/_mask.pyx b/examples/faster-rcnn/lib/pycocotools/_mask.pyx new file mode 100644 index 000000000..e08f1f480 --- /dev/null +++ b/examples/faster-rcnn/lib/pycocotools/_mask.pyx @@ -0,0 +1,291 @@ +# distutils: language = c +# distutils: sources = ../MatlabAPI/private/maskApi.c + +#************************************************************************** +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
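For reference, the generated `__Pyx_PyInt_As_*` helpers above read CPython's internal `long` representation: an array of base-2^PyLong_SHIFT digits stored least significant first, with the sign carried by `Py_SIZE`. The `case 2`/`case 3`/`case 4` branches simply shift-or a few digits together whenever the result provably fits the target C type, avoiding a slower library call. A rough Python rendering of that fast path, assuming a 30-bit-digit build (`PyLong_SHIFT` may also be 15 depending on how CPython was built):

```
PYLONG_SHIFT = 30  # assumption: a 30-bit-digit CPython build

def int_from_digits(digits, negative=False):
    # digits are least-significant-first, as in PyLongObject.ob_digit
    value = 0
    for d in reversed(digits):
        value = (value << PYLONG_SHIFT) | d
    return -value if negative else value

# mirrors the two-digit branch above: (d1 << SHIFT) | d0
assert int_from_digits([5, 1]) == (1 << PYLONG_SHIFT) | 5
assert int_from_digits([5, 1], negative=True) == -((1 << PYLONG_SHIFT) | 5)
```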
+# Licensed under the Simplified BSD License [see coco/license.txt] +#************************************************************************** + +__author__ = 'tsungyi' + +# import both Python-level and C-level symbols of Numpy +# the API uses Numpy to interface C and Python +import numpy as np +cimport numpy as np +from libc.stdlib cimport malloc, free + +# intialized Numpy. must do. +np.import_array() + +# import numpy C function +# we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management +cdef extern from "numpy/arrayobject.h": + void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) + +# Declare the prototype of the C functions in MaskApi.h +cdef extern from "maskApi.h": + ctypedef unsigned int uint + ctypedef unsigned long siz + ctypedef unsigned char byte + ctypedef double* BB + ctypedef struct RLE: + siz h, + siz w, + siz m, + uint* cnts, + void rlesInit( RLE **R, siz n ) + void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) + void rleDecode( const RLE *R, byte *mask, siz n ) + void rleMerge( const RLE *R, RLE *M, siz n, bint intersect ) + void rleArea( const RLE *R, siz n, uint *a ) + void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) + void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) + void rleToBbox( const RLE *R, BB bb, siz n ) + void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) + void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) + char* rleToString( const RLE *R ) + void rleFrString( RLE *R, char *s, siz h, siz w ) + +# python class to wrap RLE array in C +# the class handles the memory allocation and deallocation +cdef class RLEs: + cdef RLE *_R + cdef siz _n + + def __cinit__(self, siz n =0): + rlesInit(&self._R, n) + self._n = n + + # free the RLE array here + def __dealloc__(self): + if self._R is not NULL: + for i in range(self._n): + free(self._R[i].cnts) + free(self._R) + def __getattr__(self, key): + if key == 'n': + return self._n + raise AttributeError(key) + +# python class to wrap Mask array in C +# the class handles the memory allocation and deallocation +cdef class Masks: + cdef byte *_mask + cdef siz _h + cdef siz _w + cdef siz _n + + def __cinit__(self, h, w, n): + self._mask = malloc(h*w*n* sizeof(byte)) + self._h = h + self._w = w + self._n = n + # def __dealloc__(self): + # the memory management of _mask has been passed to np.ndarray + # it doesn't need to be freed here + + # called when passing into np.array() and return an np.ndarray in column-major order + def __array__(self): + cdef np.npy_intp shape[1] + shape[0] = self._h*self._w*self._n + # Create a 1D array, and reshape it to fortran/Matlab column-major array + ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') + # The _mask allocated by Masks is now handled by ndarray + PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) + return ndarray + +# internal conversion from Python RLEs object to compressed RLE format +def _toString(RLEs Rs): + cdef siz n = Rs.n + cdef bytes py_string + cdef char* c_string + objs = [] + for i in range(n): + c_string = rleToString( &Rs._R[i] ) + py_string = c_string + objs.append({ + 'size': [Rs._R[i].h, Rs._R[i].w], + 'counts': py_string + }) + free(c_string) + return objs + +# internal conversion from compressed RLE format to Python RLEs object +def _frString(rleObjs): + cdef siz n = len(rleObjs) + Rs = RLEs(n) + cdef bytes py_string + cdef char* c_string + for i, obj in enumerate(rleObjs): + py_string = 
str(obj['counts']) + c_string = py_string + rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) + return Rs + +# encode mask to RLEs objects +# list of RLE string can be generated by RLEs member function +def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): + h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] + cdef RLEs Rs = RLEs(n) + rleEncode(Rs._R,mask.data,h,w,n) + objs = _toString(Rs) + return objs + +# decode mask from compressed list of RLE string or RLEs object +def decode(rleObjs): + cdef RLEs Rs = _frString(rleObjs) + h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n + masks = Masks(h, w, n) + rleDecode( Rs._R, masks._mask, n ); + return np.array(masks) + +def merge(rleObjs, bint intersect=0): + cdef RLEs Rs = _frString(rleObjs) + cdef RLEs R = RLEs(1) + rleMerge(Rs._R, R._R, Rs._n, intersect) + obj = _toString(R)[0] + return obj + +def area(rleObjs): + cdef RLEs Rs = _frString(rleObjs) + cdef uint* _a = malloc(Rs._n* sizeof(uint)) + rleArea(Rs._R, Rs._n, _a) + cdef np.npy_intp shape[1] + shape[0] = Rs._n + a = np.array((Rs._n, ), dtype=np.uint8) + a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) + PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) + return a + +# iou computation. support function overload (RLEs-RLEs and bbox-bbox). +def iou( dt, gt, pyiscrowd ): + def _preproc(objs): + if len(objs) == 0: + return objs + if type(objs) == np.ndarray: + if len(objs.shape) == 1: + objs = objs.reshape((objs[0], 1)) + # check if it's Nx4 bbox + if not len(objs.shape) == 2 or not objs.shape[1] == 4: + raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') + objs = objs.astype(np.double) + elif type(objs) == list: + # check if list is in box format and convert it to np.ndarray + isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) + isrle = np.all(np.array([type(obj) == dict for obj in objs])) + if isbox: + objs = np.array(objs, dtype=np.double) + if len(objs.shape) == 1: + objs = objs.reshape((1,objs.shape[0])) + elif isrle: + objs = _frString(objs) + else: + raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') + else: + raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') + return objs + def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) + def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): + bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) + def _len(obj): + cdef siz N = 0 + if type(obj) == RLEs: + N = obj.n + elif len(obj)==0: + pass + elif type(obj) == np.ndarray: + N = obj.shape[0] + return N + # convert iscrowd to numpy array + cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) + # simple type checking + cdef siz m, n + dt = _preproc(dt) + gt = _preproc(gt) + m = _len(dt) + n = _len(gt) + if m == 0 or n == 0: + return [] + if not type(dt) == type(gt): + raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') + + # define local variables + cdef double* _iou = 0 + cdef np.npy_intp shape[1] + # check type and assign iou function + if type(dt) == RLEs: + _iouFun = _rleIou + elif type(dt) == np.ndarray: + _iouFun = _bbIou + else: + raise Exception('input data type not allowed.') + _iou = malloc(m*n* sizeof(double)) + iou = np.zeros((m*n, ), dtype=np.double) + shape[0] = m*n + iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) + PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) + _iouFun(dt, gt, iscrowd, m, n, iou) + return iou.reshape((m,n), order='F') + +def toBbox( rleObjs ): + cdef RLEs Rs = _frString(rleObjs) + cdef siz n = Rs.n + cdef BB _bb = malloc(4*n* sizeof(double)) + rleToBbox( Rs._R, _bb, n ) + cdef np.npy_intp shape[1] + shape[0] = 4*n + bb = np.array((1,4*n), dtype=np.double) + bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) + PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) + return bb + +def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): + cdef siz n = bb.shape[0] + Rs = RLEs(n) + rleFrBbox( Rs._R, bb.data, h, w, n ) + objs = _toString(Rs) + return objs + +def frPoly( poly, siz h, siz w ): + cdef np.ndarray[np.double_t, ndim=1] np_poly + n = len(poly) + Rs = RLEs(n) + for i, p in enumerate(poly): + np_poly = np.array(p, dtype=np.double, order='F') + rleFrPoly( &Rs._R[i], np_poly.data, len(np_poly)/2, h, w ) + objs = _toString(Rs) + return objs + +def frUncompressedRLE(ucRles, siz h, siz w): + cdef np.ndarray[np.uint32_t, ndim=1] cnts + cdef RLE R + cdef uint *data + n = len(ucRles) + objs = [] + for i in range(n): + Rs = RLEs(1) + cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) + # time for malloc can be saved here but it's fine + data = malloc(len(cnts)* sizeof(uint)) + for j in range(len(cnts)): + data[j] = cnts[j] + R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) + Rs._R[0] = R + objs.append(_toString(Rs)[0]) + return objs + +def frPyObjects(pyobj, siz h, w): + if type(pyobj) == np.ndarray: + objs = frBbox(pyobj, h, w ) + elif type(pyobj) == list and len(pyobj[0]) == 4: + objs = frBbox(pyobj, h, w ) + elif type(pyobj) == list and len(pyobj[0]) > 4: + objs = frPoly(pyobj, h, w ) + elif type(pyobj) == list and type(pyobj[0]) == dict: + objs = frUncompressedRLE(pyobj, h, w) + else: + raise Exception('input type is not supported.') + return objs diff --git a/examples/faster-rcnn/lib/pycocotools/coco.py b/examples/faster-rcnn/lib/pycocotools/coco.py new file mode 100644 
index 000000000..5d9f6b826 --- /dev/null +++ b/examples/faster-rcnn/lib/pycocotools/coco.py @@ -0,0 +1,351 @@ +__author__ = 'tylin' +__version__ = '1.0.1' +# Interface for accessing the Microsoft COCO dataset. + +# Microsoft COCO is a large image dataset designed for object detection, +# segmentation, and caption generation. pycocotools is a Python API that +# assists in loading, parsing and visualizing the annotations in COCO. +# Please visit http://mscoco.org/ for more information on COCO, including +# for the data, paper, and tutorials. The exact format of the annotations +# is also described on the COCO website. For example usage of the pycocotools +# please see pycocotools_demo.ipynb. In addition to this API, please download both +# the COCO images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *instance* and *caption* annotations. In the case of +# captions not all functions are defined (e.g. categories are undefined). + +# The following API functions are defined: +# COCO - COCO api class that loads COCO annotation file and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. +# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# segToMask - Convert polygon segmentation to binary mask. +# showAnns - Display the specified annotations. +# loadRes - Load algorithm results and create API for accessing them. +# download - Download COCO images from mscoco.org server. +# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. +# Help on each functions can be accessed by: "help COCO>function". + +# See also COCO>decodeMask, +# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, +# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, +# COCO>loadImgs, COCO>segToMask, COCO>showAnns + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import json +import datetime +import time +import matplotlib.pyplot as plt +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +import numpy as np +from skimage.draw import polygon +import urllib +import copy +import itertools +import mask +import os + +class COCO: + def __init__(self, annotation_file=None): + """ + Constructor of Microsoft COCO helper class for reading and visualizing annotations. + :param annotation_file (str): location of annotation file + :param image_folder (str): location to the folder that hosts images. + :return: + """ + # load dataset + self.dataset = {} + self.anns = [] + self.imgToAnns = {} + self.catToImgs = {} + self.imgs = {} + self.cats = {} + if not annotation_file == None: + print 'loading annotations into memory...' + tic = time.time() + dataset = json.load(open(annotation_file, 'r')) + print 'Done (t=%0.2fs)'%(time.time()- tic) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print 'creating index...' 
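The loop just below builds plain-dict lookup tables from the annotation file. As a minimal illustration of the structures it produces (toy data only; the ids are hypothetical, field names follow the COCO annotation format):

```
# Hypothetical toy dataset in COCO annotation format.
dataset = {
    'images':      [{'id': 1}, {'id': 2}],
    'categories':  [{'id': 7, 'name': 'cat'}],
    'annotations': [{'id': 10, 'image_id': 1, 'category_id': 7},
                    {'id': 11, 'image_id': 2, 'category_id': 7}],
}

imgToAnns = {ann['image_id']: [] for ann in dataset['annotations']}
catToImgs = {cat['id']: [] for cat in dataset['categories']}
for ann in dataset['annotations']:
    imgToAnns[ann['image_id']].append(ann)      # image id -> its annotations
    catToImgs[ann['category_id']].append(ann['image_id'])  # cat id -> image ids

assert [a['id'] for a in imgToAnns[1]] == [10]
assert catToImgs[7] == [1, 2]   # an image id repeats once per annotation
```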
+ anns = {} + imgToAnns = {} + catToImgs = {} + cats = {} + imgs = {} + if 'annotations' in self.dataset: + imgToAnns = {ann['image_id']: [] for ann in self.dataset['annotations']} + anns = {ann['id']: [] for ann in self.dataset['annotations']} + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']] += [ann] + anns[ann['id']] = ann + + if 'images' in self.dataset: + imgs = {im['id']: {} for im in self.dataset['images']} + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + cats = {cat['id']: [] for cat in self.dataset['categories']} + for cat in self.dataset['categories']: + cats[cat['id']] = cat + catToImgs = {cat['id']: [] for cat in self.dataset['categories']} + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']] += [ann['image_id']] + + print 'index created!' + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print '%s: %s'%(key, value) + + def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): + """ + Get ann ids that satisfy given filter conditions. default skips that filter + :param imgIds (int array) : get anns for given imgs + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range (e.g. [0 inf]) + iscrowd (boolean) : get anns for given crowd label (False or True) + :return: ids (int array) : integer array of ann ids + """ + imgIds = imgIds if type(imgIds) == list else [imgIds] + catIds = catIds if type(catIds) == list else [catIds] + + if len(imgIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(imgIds) == 0: + # this can be changed by defaultdict + lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] + anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] + if not iscrowd == None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """ + filtering parameters. default skips that filter. 
+ :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if type(catNms) == list else [catNms] + supNms = supNms if type(supNms) == list else [supNms] + catIds = catIds if type(catIds) == list else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] + cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] + cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] + ids = [cat['id'] for cat in cats] + return ids + + def getImgIds(self, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + imgIds = imgIds if type(imgIds) == list else [imgIds] + catIds = catIds if type(catIds) == list else [catIds] + + if len(imgIds) == len(catIds) == 0: + ids = self.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToImgs[catId]) + else: + ids &= set(self.catToImgs[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if type(ids) == list: + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """ + Load cats with the specified ids. + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if type(ids) == list: + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadImgs(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if type(ids) == list: + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def showAnns(self, anns): + """ + Display the specified annotations. 
+ :param anns (array of object): annotations to display + :return: None + """ + if len(anns) == 0: + return 0 + if 'segmentation' in anns[0]: + datasetType = 'instances' + elif 'caption' in anns[0]: + datasetType = 'captions' + if datasetType == 'instances': + ax = plt.gca() + polygons = [] + color = [] + for ann in anns: + c = np.random.random((1, 3)).tolist()[0] + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape((len(seg)/2, 2)) + polygons.append(Polygon(poly, True,alpha=0.4)) + color.append(c) + else: + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = mask.frPyObjects([ann['segmentation']], t['height'], t['width']) + else: + rle = [ann['segmentation']] + m = mask.decode(rle) + img = np.ones( (m.shape[0], m.shape[1], 3) ) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0,166.0,101.0])/255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:,:,i] = color_mask[i] + ax.imshow(np.dstack( (img, m*0.5) )) + p = PatchCollection(polygons, facecolors=color, edgecolors=(0,0,0,1), linewidths=3, alpha=0.4) + ax.add_collection(p) + elif datasetType == 'captions': + for ann in anns: + print ann['caption'] + + def loadRes(self, resFile): + """ + Load result file and return a result api object. + :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + # res.dataset['info'] = copy.deepcopy(self.dataset['info']) + # res.dataset['licenses'] = copy.deepcopy(self.dataset['licenses']) + + print 'Loading and preparing results... ' + tic = time.time() + anns = json.load(open(resFile)) + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id+1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] + if not 'segmentation' in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2]*bb[3] + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = mask.area([ann['segmentation']])[0] + if not 'bbox' in ann: + ann['bbox'] = mask.toBbox([ann['segmentation']])[0] + ann['id'] = id+1 + ann['iscrowd'] = 0 + print 'DONE (t=%0.2fs)'%(time.time()- tic) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def download( self, tarDir = None, imgIds = [] ): + ''' + Download COCO images from mscoco.org server. 
+ :param tarDir (str): COCO results directory name + imgIds (list): images to be downloaded + :return: + ''' + if tarDir is None: + print 'Please specify target directory' + return -1 + if len(imgIds) == 0: + imgs = self.imgs.values() + else: + imgs = self.loadImgs(imgIds) + N = len(imgs) + if not os.path.exists(tarDir): + os.makedirs(tarDir) + for i, img in enumerate(imgs): + tic = time.time() + fname = os.path.join(tarDir, img['file_name']) + if not os.path.exists(fname): + urllib.urlretrieve(img['coco_url'], fname) + print 'downloaded %d/%d images (t=%.1fs)'%(i, N, time.time()- tic) diff --git a/examples/faster-rcnn/lib/pycocotools/cocoeval.py b/examples/faster-rcnn/lib/pycocotools/cocoeval.py new file mode 100644 index 000000000..f389eb0f2 --- /dev/null +++ b/examples/faster-rcnn/lib/pycocotools/cocoeval.py @@ -0,0 +1,444 @@ +__author__ = 'tsungyi' + +import numpy as np +import datetime +import time +from collections import defaultdict +import mask +import copy + +class COCOeval: + # Interface for evaluating detection on the Microsoft COCO dataset. + # + # The usage for CocoEval is as follows: + # cocoGt=..., cocoDt=... # load dataset and results + # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # useSegm - [1] if true evaluate against ground-truth segments + # useCats - [1] if true use category labels for evaluation # Note: if useSegm=0 the evaluation is run on bounding boxes. + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. + # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
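Putting the usage outline above into a concrete, hedged sketch (the annotation and result file names are placeholders, and the pycocotools extension must already be built):

```
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

cocoGt = COCO('instances_val.json')          # ground truth (placeholder name)
cocoDt = cocoGt.loadRes('detections.json')   # detection results (placeholder name)
E = COCOeval(cocoGt, cocoDt)
E.params.useSegm = 0    # evaluate on bounding boxes rather than segments
E.evaluate()            # per-image, per-category evaluation
E.accumulate()          # build the precision/recall tables
E.summarize()           # print the twelve AP/AR summary metrics
```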
+ # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__(self, cocoGt=None, cocoDt=None): + ''' + Initialize CocoEval using coco APIs for gt and dt + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + ''' + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.params = {} # evaluation parameters + self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params() # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if not cocoGt is None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + # + def _toMask(objs, coco): + # modify segmentation by reference + for obj in objs: + t = coco.imgs[obj['image_id']] + if type(obj['segmentation']) == list: + if type(obj['segmentation'][0]) == dict: + print 'debug' + obj['segmentation'] = mask.frPyObjects(obj['segmentation'],t['height'],t['width']) + if len(obj['segmentation']) == 1: + obj['segmentation'] = obj['segmentation'][0] + else: + # an object can have multiple polygon regions + # merge them into one RLE mask + obj['segmentation'] = mask.merge(obj['segmentation']) + elif type(obj['segmentation']) == dict and type(obj['segmentation']['counts']) == list: + obj['segmentation'] = mask.frPyObjects([obj['segmentation']],t['height'],t['width'])[0] + elif type(obj['segmentation']) == dict and \ + type(obj['segmentation']['counts'] == unicode or type(obj['segmentation']['counts']) == str): + pass + else: + raise Exception('segmentation format not supported.') + p = self.params + if p.useCats: + gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + else: + gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + if p.useSegm: + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['image_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['image_id'], dt['category_id']].append(dt) + self.evalImgs = defaultdict(list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + ''' + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + tic = time.time() + print 'Running per image evaluation... 
' + p = self.params + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params=p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + computeIoU = self.computeIoU + self.ious = {(imgId, catId): computeIoU(imgId, catId) \ + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print 'DONE (t=%0.2fs).'%(toc-tic) + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId,catId] + dt = self._dts[imgId,catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] + if len(gt) == 0 and len(dt) ==0: + return [] + dt = sorted(dt, key=lambda x: -x['score']) + if len(dt) > p.maxDets[-1]: + dt=dt[0:p.maxDets[-1]] + + if p.useSegm: + g = [g['segmentation'] for g in gt] + d = [d['segmentation'] for d in dt] + else: + g = [g['bbox'] for g in gt] + d = [d['bbox'] for d in dt] + + # compute iou between each dt and gt region + iscrowd = [int(o['iscrowd']) for o in gt] + ious = mask.iou(d,g,iscrowd) + return ious + + def evaluateImg(self, imgId, catId, aRng, maxDet): + ''' + perform evaluation for single category and image + :return: dict (single image results) + ''' + # + p = self.params + if p.useCats: + gt = self._gts[imgId,catId] + dt = self._dts[imgId,catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] + if len(gt) == 0 and len(dt) ==0: + return None + + for g in gt: + if 'ignore' not in g: + g['ignore'] = 0 + if g['iscrowd'] == 1 or g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + # gt = sorted(gt, key=lambda x: x['_ignore']) + gtind = [ind for (ind, g) in sorted(enumerate(gt), key=lambda (ind, g): g['_ignore']) ] + + gt = [gt[ind] for ind in gtind] + dt = sorted(dt, key=lambda x: -x['score'])[0:maxDet] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + N_iou = len(self.ious[imgId, catId]) + ious = self.ious[imgId, catId][0:maxDet, np.array(gtind)] if N_iou >0 else self.ious[imgId, catId] + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T,G)) + dtm = np.zeros((T,D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T,D)) + if not len(ious)==0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t,1-1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind,gind]>0 and not iscrowd[gind]: + continue + # if dt matched to a regular gt, and now on an ignore gt, stop + if m>-1 and gtIg[m]==0 and gtIg[gind]==1: + break + # continue to next gt unless better match made + if ious[dind,gind] < iou: + continue + # match successful and best so far, store appropriately + iou=ious[dind,gind] + m=gind + # if match made store id of match for both dt and gt + if m ==-1: + continue + dtIg[tind,dind] = gtIg[m] + dtm[tind,dind] = gt[m]['id'] + gtm[tind,m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt)))
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) + # store results for given image and category + return { + 'image_id': imgId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def accumulate(self, p = None): + ''' + Accumulate per image evaluation results and store the result in self.eval + :param p: input params for evaluation + :return: None + ''' + print 'Accumulating evaluation results... ' + tic = time.time() + if not self.evalImgs: + print 'Please run evaluate() first' + # allows input customized parameters + if p is None: + p = self.params + p.catIds = p.catIds if p.useCats == 1 else [-1] + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories + recall = -np.ones((T,K,A,M)) + + # create dictionary for future indexing + _pe = self._paramsEval + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.imgIds) + # get inds to evaluate + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] + i_list = [n for n, i in enumerate(p.imgIds) if i in setI] + # K0 = len(_pe.catIds) + I0 = len(_pe.imgIds) + A0 = len(_pe.areaRng) + # retrieve E at each category, area range, and max number of detections + for k, k0 in enumerate(k_list): + Nk = k0*A0*I0 + for a, a0 in enumerate(a_list): + Na = a0*I0 + for m, maxDet in enumerate(m_list): + E = [self.evalImgs[Nk+Na+i] for i in i_list] + E = filter(None, E) + if len(E) == 0: + continue + dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) + + # different sorting methods generate slightly different results; + # mergesort is used to be consistent with the Matlab implementation.
+                    inds = np.argsort(-dtScores, kind='mergesort')
+
+                    dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
+                    dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
+                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
+                    npig = len([ig for ig in gtIg if ig == 0])
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and( dtm, np.logical_not(dtIg) )
+                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
+
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp+tp+np.spacing(1))
+                        q = np.zeros((R,))
+
+                        if nd:
+                            recall[t,k,a,m] = rc[-1]
+                        else:
+                            recall[t,k,a,m] = 0
+
+                        # numpy is slow without cython optimization for accessing elements;
+                        # using a python list gets a significant speed improvement
+                        pr = pr.tolist(); q = q.tolist()
+
+                        for i in range(nd-1, 0, -1):
+                            if pr[i] > pr[i-1]:
+                                pr[i-1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs)
+                        try:
+                            for ri, pi in enumerate(inds):
+                                q[ri] = pr[pi]
+                        except:
+                            pass
+                        precision[t,:,k,a,m] = np.array(q)
+        self.eval = {
+            'params': p,
+            'counts': [T, R, K, A, M],
+            'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            'precision': precision,
+            'recall': recall,
+        }
+        toc = time.time()
+        print 'DONE (t=%0.2fs).'%( toc-tic )
+
+    def summarize(self):
+        '''
+        Compute and display summary metrics for evaluation results.
+        Note this function can *only* be applied on the default parameter setting
+        '''
+        def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
+            p = self.params
+            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6} | maxDets={:>3} ] = {}'
+            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
+            typeStr = '(AP)' if ap==1 else '(AR)'
+            iouStr = '%0.2f:%0.2f'%(p.iouThrs[0], p.iouThrs[-1]) if iouThr is None else '%0.2f'%(iouThr)
+            areaStr = areaRng
+            maxDetsStr = '%d'%(maxDets)
+
+            aind = [i for i, aRng in enumerate(['all', 'small', 'medium', 'large']) if aRng == areaRng]
+            mind = [i for i, mDet in enumerate([1, 10, 100]) if mDet == maxDets]
+            if ap == 1:
+                # dimension of precision: [TxRxKxAxM]
+                s = self.eval['precision']
+                # IoU
+                if iouThr is not None:
+                    t = np.where(iouThr == p.iouThrs)[0]
+                    s = s[t]
+                # areaRng
+                s = s[:,:,:,aind,mind]
+            else:
+                # dimension of recall: [TxKxAxM]
+                s = self.eval['recall']
+                s = s[:,:,aind,mind]
+            if len(s[s>-1])==0:
+                mean_s = -1
+            else:
+                mean_s = np.mean(s[s>-1])
+            print iStr.format(titleStr, typeStr, iouStr, areaStr, maxDetsStr, '%.3f'%(float(mean_s)))
+            return mean_s
+
+        if not self.eval:
+            raise Exception('Please run accumulate() first')
+        self.stats = np.zeros((12,))
+        self.stats[0] = _summarize(1)
+        self.stats[1] = _summarize(1,iouThr=.5)
+        self.stats[2] = _summarize(1,iouThr=.75)
+        self.stats[3] = _summarize(1,areaRng='small')
+        self.stats[4] = _summarize(1,areaRng='medium')
+        self.stats[5] = _summarize(1,areaRng='large')
+        self.stats[6] = _summarize(0,maxDets=1)
+        self.stats[7] = _summarize(0,maxDets=10)
+        self.stats[8] = _summarize(0,maxDets=100)
+        self.stats[9] = _summarize(0,areaRng='small')
+        self.stats[10] = _summarize(0,areaRng='medium')
+        self.stats[11] = _summarize(0,areaRng='large')
+
+    def __str__(self):
+        self.summarize()
+
+class Params:
+    '''
+    Params for coco evaluation api
+    '''
+    def __init__(self):
+        self.imgIds = []
+        self.catIds = []
+        # np.arange causes trouble.  the data point on arange is slightly larger than the true value
+        self.iouThrs = np.linspace(.5, 0.95, np.round((0.95-.5)/.05)+1, endpoint=True)
+        self.recThrs = np.linspace(.0, 1.00, np.round((1.00-.0)/.01)+1, endpoint=True)
+        self.maxDets = [1,10,100]
+        self.areaRng = [ [0**2,1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2] ]
+        self.useSegm = 0
+        self.useCats = 1
\ No newline at end of file
diff --git a/examples/faster-rcnn/lib/pycocotools/license.txt b/examples/faster-rcnn/lib/pycocotools/license.txt
new file mode 100644
index 000000000..495c163a6
--- /dev/null
+++ b/examples/faster-rcnn/lib/pycocotools/license.txt
@@ -0,0 +1,26 @@
+Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of the FreeBSD Project.
diff --git a/examples/faster-rcnn/lib/pycocotools/mask.py b/examples/faster-rcnn/lib/pycocotools/mask.py
new file mode 100644
index 000000000..673254674
--- /dev/null
+++ b/examples/faster-rcnn/lib/pycocotools/mask.py
@@ -0,0 +1,82 @@
+__author__ = 'tsungyi'
+
+import pycocotools._mask as _mask
+
+# Interface for manipulating masks stored in RLE format.
+#
+# RLE is a simple yet efficient format for storing binary masks. RLE
+# first divides a vector (or vectorized image) into a series of piecewise
+# constant regions and then for each piece simply stores the length of
+# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
+# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
+# (note that the odd counts are always the numbers of zeros). Instead of
+# storing the counts directly, additional compression is achieved with a
+# variable bitrate representation based on a common scheme called LEB128.
+#
+# Compression is greatest given large piecewise constant regions.
+# Specifically, the size of the RLE is proportional to the number of
+# *boundaries* in M (or for an image the number of boundaries in the y
+# direction). Assuming fairly simple shapes, the RLE representation is
+# O(sqrt(n)) where n is number of pixels in the object.
Hence space usage +# is substantially lower, especially for large simple objects (large n). +# +# Many common operations on masks can be computed directly using the RLE +# (without need for decoding). This includes computations such as area, +# union, intersection, etc. All of these operations are linear in the +# size of the RLE, in other words they are O(sqrt(n)) where n is the area +# of the object. Computing these operations on the original mask is O(n). +# Thus, using the RLE can result in substantial computational savings. +# +# The following API functions are defined: +# encode - Encode binary masks using RLE. +# decode - Decode binary masks encoded via RLE. +# merge - Compute union or intersection of encoded masks. +# iou - Compute intersection over union between masks. +# area - Compute area of encoded masks. +# toBbox - Get bounding boxes surrounding encoded masks. +# frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. +# +# Usage: +# Rs = encode( masks ) +# masks = decode( Rs ) +# R = merge( Rs, intersect=false ) +# o = iou( dt, gt, iscrowd ) +# a = area( Rs ) +# bbs = toBbox( Rs ) +# Rs = frPyObjects( [pyObjects], h, w ) +# +# In the API the following formats are used: +# Rs - [dict] Run-length encoding of binary masks +# R - dict Run-length encoding of binary mask +# masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) +# iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore +# bbs - [nx4] Bounding box(es) stored as [x y w h] +# poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) +# dt,gt - May be either bounding boxes or encoded masks +# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). +# +# Finally, a note about the intersection over union (iou) computation. +# The standard iou of a ground truth (gt) and detected (dt) object is +# iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) +# For "crowd" regions, we use a modified criteria. If a gt object is +# marked as "iscrowd", we allow a dt to match any subregion of the gt. +# Choosing gt' in the crowd gt that best matches the dt can be done using +# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing +# iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) +# For crowd gt regions we use this modified criteria above for the iou. +# +# To compile run "python setup.py build_ext --inplace" +# Please do not contact us for help with compiling. +# +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. +# Licensed under the Simplified BSD License [see coco/license.txt] + +encode = _mask.encode +decode = _mask.decode +iou = _mask.iou +merge = _mask.merge +area = _mask.area +toBbox = _mask.toBbox +frPyObjects = _mask.frPyObjects \ No newline at end of file diff --git a/examples/faster-rcnn/lib/pycocotools/maskApi.c b/examples/faster-rcnn/lib/pycocotools/maskApi.c new file mode 100644 index 000000000..2b2d89116 --- /dev/null +++ b/examples/faster-rcnn/lib/pycocotools/maskApi.c @@ -0,0 +1,208 @@ +/************************************************************************** +* Microsoft COCO Toolbox. version 2.0 +* Data, paper, and tutorials available at: http://mscoco.org/ +* Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
+* Licensed under the Simplified BSD License [see coco/license.txt]
+**************************************************************************/
+#include "maskApi.h"
+#include <math.h>
+#include <stdlib.h>
+
+uint umin( uint a, uint b ) { return (a<b) ? a : b; }
+uint umax( uint a, uint b ) { return (a>b) ? a : b; }
+
+void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) {
+  R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m);
+  if(cnts) for(siz j=0; j<m; j++) R->cnts[j]=cnts[j];
+}
+
+void rleFree( RLE *R ) {
+  free(R->cnts); R->cnts=0;
+}
+
+void rlesInit( RLE **R, siz n ) {
+  *R = (RLE*) malloc(sizeof(RLE)*n);
+  for(siz i=0; i0 ) {
+    c=umin(ca,cb); cc+=c; ct=0;
+    ca-=c; if(!ca && a0) {
+    crowd=iscrowd!=NULL && iscrowd[g];
+    if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; }
+    siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb;
+    ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0;
+    cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1;
+    while( ct>0 ) {
+      c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0;
+      ca-=c; if(!ca && ad?1:c=dy && xs>xe) || (dxye);
+    if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; }
+    s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy;
+    if(dx>=dy) for( int d=0; d<=dx; d++ ) {
+      t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++;
+    } else for( int d=0; d<=dy; d++ ) {
+      t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++;
+    }
+  }
+  // get points along y-boundary and downsample
+  free(x); free(y); k=m; m=0; double xd, yd;
+  x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k);
+  for( j=1; jw-1 ) continue;
+    yd=(double)(v[j]h) yd=h; yd=ceil(yd);
+    x[m]=(int) xd; y[m]=(int) yd; m++;
+  }
+  // compute rle encoding given y-boundary points
+  k=m; a=malloc(sizeof(uint)*(k+1));
+  for( j=0; j0) b[m++]=a[j++]; else {
+    j++; if(jm, p=0; long x; bool more;
+  char *s=malloc(sizeof(char)*m*6);
+  for( i=0; i<m; i++ ) {
+    x=(long) R->cnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1;
+    while( more ) {
+      char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0;
+      if(more) c |= 0x20; c+=48; s[p++]=c;
+    }
+  }
+  s[p]=0; return s;
+}
+
+void rleFrString( RLE *R, char *s, siz h, siz w ) {
+  siz m=0, p=0, k; long x; bool more; uint *cnts;
+  while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0;
+  while( s[p] ) {
+    x=0; k=0; more=1;
+    while( more ) {
+      char c=s[p]-48; x |= (c & 0x1f) << 5*k;
+      more = c & 0x20; p++; k++;
+      if(!more && (c & 0x10)) x |= -1 << 5*k;
+    }
+    if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x;
+  }
+  rleInit(R,h,w,m,cnts); free(cnts);
+}
diff --git a/examples/faster-rcnn/lib/pycocotools/maskApi.h b/examples/faster-rcnn/lib/pycocotools/maskApi.h
new file mode 100644
index 000000000..ff16116c4
--- /dev/null
+++ b/examples/faster-rcnn/lib/pycocotools/maskApi.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+* Microsoft COCO Toolbox. version 2.0
+* Data, paper, and tutorials available at: http://mscoco.org/
+* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+* Licensed under the Simplified BSD License [see coco/license.txt]
+**************************************************************************/
+#pragma once
+#include <stdbool.h>
+
+typedef unsigned int uint;
+typedef unsigned long siz;
+typedef unsigned char byte;
+typedef double* BB;
+typedef struct { siz h, w, m; uint *cnts; } RLE;
+
+// Initialize/destroy RLE.
+void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
+void rleFree( RLE *R );
+
+// Initialize/destroy RLE array.
+void rlesInit( RLE **R, siz n );
+void rlesFree( RLE **R, siz n );
+
+// Encode binary masks using RLE.
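As a side note, the counts representation documented in `mask.py` above is easy to reproduce outside the C extension. A minimal Python sketch of the plain (pre-LEB128) counts for a binary vector follows; it illustrates the format only and is not the toolbox's encoder:

```python
def rle_counts(m):
    # Run lengths of m, starting with a (possibly empty) run of zeros,
    # so runs of zeros and ones alternate: [2, 3, 1, 1] means 00 111 0 1.
    counts, prev, run = [], 0, 0
    for v in m:
        if v != prev:
            counts.append(run)
            prev, run = v, 0
        run += 1
    counts.append(run)
    return counts

print(rle_counts([0, 0, 1, 1, 1, 0, 1]))  # [2, 3, 1, 1]
print(rle_counts([1, 1, 1, 1, 1, 1, 0]))  # [0, 6, 1]
```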
+void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); + +// Decode binary masks encoded via RLE. +void rleDecode( const RLE *R, byte *mask, siz n ); + +// Compute union or intersection of encoded masks. +void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); + +// Compute area of encoded masks. +void rleArea( const RLE *R, siz n, uint *a ); + +// Compute intersection over union between masks. +void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); + +// Compute intersection over union between bounding boxes. +void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); + +// Get bounding boxes surrounding encoded masks. +void rleToBbox( const RLE *R, BB bb, siz n ); + +// Convert bounding boxes to encoded masks. +void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); + +// Convert polygon to encoded mask. +void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); + +// Get compressed string representation of encoded mask. +char* rleToString( const RLE *R ); + +// Convert from compressed string representation of encoded mask. +void rleFrString( RLE *R, char *s, siz h, siz w ); diff --git a/examples/faster-rcnn/lib/roi_data_layer/__init__.py b/examples/faster-rcnn/lib/roi_data_layer/__init__.py new file mode 100644 index 000000000..7ba6a65c5 --- /dev/null +++ b/examples/faster-rcnn/lib/roi_data_layer/__init__.py @@ -0,0 +1,6 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- diff --git a/examples/faster-rcnn/lib/roi_data_layer/layer.py b/examples/faster-rcnn/lib/roi_data_layer/layer.py new file mode 100644 index 000000000..04f41728b --- /dev/null +++ b/examples/faster-rcnn/lib/roi_data_layer/layer.py @@ -0,0 +1,196 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""The data layer used during training to train a Fast R-CNN network. + +RoIDataLayer implements a Caffe Python layer. 
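+
+Its top blobs are 'data' plus either 'im_info' and 'gt_boxes' (when
+cfg.TRAIN.HAS_RPN is set) or 'rois', 'labels' and the optional
+bbox-regression blobs; see RoIDataLayer.setup below.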
+""" + +import caffe +from fast_rcnn.config import cfg +from roi_data_layer.minibatch import get_minibatch +import numpy as np +import yaml +from multiprocessing import Process, Queue + +class RoIDataLayer(caffe.Layer): + """Fast R-CNN data layer used for training.""" + + def _shuffle_roidb_inds(self): + """Randomly permute the training roidb.""" + if cfg.TRAIN.ASPECT_GROUPING: + widths = np.array([r['width'] for r in self._roidb]) + heights = np.array([r['height'] for r in self._roidb]) + horz = (widths >= heights) + vert = np.logical_not(horz) + horz_inds = np.where(horz)[0] + vert_inds = np.where(vert)[0] + inds = np.hstack(( + np.random.permutation(horz_inds), + np.random.permutation(vert_inds))) + inds = np.reshape(inds, (-1, 2)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1,)) + self._perm = inds + else: + self._perm = np.random.permutation(np.arange(len(self._roidb))) + self._cur = 0 + + def _get_next_minibatch_inds(self): + """Return the roidb indices for the next minibatch.""" + if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): + self._shuffle_roidb_inds() + + db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] + self._cur += cfg.TRAIN.IMS_PER_BATCH + return db_inds + + def _get_next_minibatch(self): + """Return the blobs to be used for the next minibatch. + + If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a + separate process and made available through self._blob_queue. + """ + if cfg.TRAIN.USE_PREFETCH: + return self._blob_queue.get() + else: + db_inds = self._get_next_minibatch_inds() + minibatch_db = [self._roidb[i] for i in db_inds] + return get_minibatch(minibatch_db, self._num_classes) + + def set_roidb(self, roidb): + """Set the roidb to be used by this layer during training.""" + self._roidb = roidb + self._shuffle_roidb_inds() + if cfg.TRAIN.USE_PREFETCH: + self._blob_queue = Queue(10) + self._prefetch_process = BlobFetcher(self._blob_queue, + self._roidb, + self._num_classes) + self._prefetch_process.start() + # Terminate the child process when the parent exists + def cleanup(): + print 'Terminating BlobFetcher' + self._prefetch_process.terminate() + self._prefetch_process.join() + import atexit + atexit.register(cleanup) + + def setup(self, bottom, top): + """Setup the RoIDataLayer.""" + + # parse the layer parameter string, which must be valid YAML + layer_params = yaml.load(self.param_str) + + self._num_classes = layer_params['num_classes'] + + self._name_to_top_map = {} + + # data blob: holds a batch of N images, each with 3 channels + idx = 0 + top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3, + max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE) + self._name_to_top_map['data'] = idx + idx += 1 + + if cfg.TRAIN.HAS_RPN: + top[idx].reshape(1, 3) + self._name_to_top_map['im_info'] = idx + idx += 1 + + top[idx].reshape(1, 4) + self._name_to_top_map['gt_boxes'] = idx + idx += 1 + else: # not using RPN + # rois blob: holds R regions of interest, each is a 5-tuple + # (n, x1, y1, x2, y2) specifying an image batch index n and a + # rectangle (x1, y1, x2, y2) + top[idx].reshape(1, 5) + self._name_to_top_map['rois'] = idx + idx += 1 + + # labels blob: R categorical labels in [0, ..., K] for K foreground + # classes plus background + top[idx].reshape(1) + self._name_to_top_map['labels'] = idx + idx += 1 + + if cfg.TRAIN.BBOX_REG: + # bbox_targets blob: R bounding-box regression targets with 4 + # targets per class + top[idx].reshape(1, self._num_classes * 4) + 
+                self._name_to_top_map['bbox_targets'] = idx
+                idx += 1
+
+                # bbox_inside_weights blob: At most 4 targets per roi are active;
+                # this binary vector specifies the subset of active targets
+                top[idx].reshape(1, self._num_classes * 4)
+                self._name_to_top_map['bbox_inside_weights'] = idx
+                idx += 1
+
+                top[idx].reshape(1, self._num_classes * 4)
+                self._name_to_top_map['bbox_outside_weights'] = idx
+                idx += 1
+
+        print 'RoiDataLayer: name_to_top:', self._name_to_top_map
+        assert len(top) == len(self._name_to_top_map)
+
+    def forward(self, bottom, top):
+        """Get blobs and copy them into this layer's top blob vector."""
+        blobs = self._get_next_minibatch()
+
+        for blob_name, blob in blobs.iteritems():
+            top_ind = self._name_to_top_map[blob_name]
+            # Reshape net's input blobs
+            top[top_ind].reshape(*(blob.shape))
+            # Copy data into net's input blobs
+            top[top_ind].data[...] = blob.astype(np.float32, copy=False)
+
+    def backward(self, top, propagate_down, bottom):
+        """This layer does not propagate gradients."""
+        pass
+
+    def reshape(self, bottom, top):
+        """Reshaping happens during the call to forward."""
+        pass
+
+class BlobFetcher(Process):
+    """Experimental class for prefetching blobs in a separate process."""
+    def __init__(self, queue, roidb, num_classes):
+        super(BlobFetcher, self).__init__()
+        self._queue = queue
+        self._roidb = roidb
+        self._num_classes = num_classes
+        self._perm = None
+        self._cur = 0
+        self._shuffle_roidb_inds()
+        # fix the random seed for reproducibility
+        np.random.seed(cfg.RNG_SEED)
+
+    def _shuffle_roidb_inds(self):
+        """Randomly permute the training roidb."""
+        # TODO(rbg): remove duplicated code
+        self._perm = np.random.permutation(np.arange(len(self._roidb)))
+        self._cur = 0
+
+    def _get_next_minibatch_inds(self):
+        """Return the roidb indices for the next minibatch."""
+        # TODO(rbg): remove duplicated code
+        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
+            self._shuffle_roidb_inds()
+
+        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
+        self._cur += cfg.TRAIN.IMS_PER_BATCH
+        return db_inds
+
+    def run(self):
+        print 'BlobFetcher started'
+        while True:
+            db_inds = self._get_next_minibatch_inds()
+            minibatch_db = [self._roidb[i] for i in db_inds]
+            blobs = get_minibatch(minibatch_db, self._num_classes)
+            self._queue.put(blobs)
diff --git a/examples/faster-rcnn/lib/roi_data_layer/minibatch.py b/examples/faster-rcnn/lib/roi_data_layer/minibatch.py
new file mode 100644
index 000000000..f4535b022
--- /dev/null
+++ b/examples/faster-rcnn/lib/roi_data_layer/minibatch.py
@@ -0,0 +1,199 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Compute minibatch blobs for training a Fast R-CNN network."""
+
+import numpy as np
+import numpy.random as npr
+import cv2
+from fast_rcnn.config import cfg
+from utils.blob import prep_im_for_blob, im_list_to_blob
+
+def get_minibatch(roidb, num_classes):
+    """Given a roidb, construct a minibatch sampled from it."""
+    num_images = len(roidb)
+    # Sample random scales to use for each image in this batch
+    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
+                                    size=num_images)
+    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
+        'num_images ({}) must divide BATCH_SIZE ({})'.
\ + format(num_images, cfg.TRAIN.BATCH_SIZE) + rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images + fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) + + # Get the input image blob, formatted for caffe + im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) + + blobs = {'data': im_blob} + + if cfg.TRAIN.HAS_RPN: + assert len(im_scales) == 1, "Single batch only" + assert len(roidb) == 1, "Single batch only" + # gt boxes: (x1, y1, x2, y2, cls) + gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] + gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) + gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] + gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] + blobs['gt_boxes'] = gt_boxes + blobs['im_info'] = np.array( + [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], + dtype=np.float32) + else: # not using RPN + # Now, build the region of interest and label blobs + rois_blob = np.zeros((0, 5), dtype=np.float32) + labels_blob = np.zeros((0), dtype=np.float32) + bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32) + bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32) + # all_overlaps = [] + for im_i in xrange(num_images): + labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \ + = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, + num_classes) + + # Add to RoIs blob + rois = _project_im_rois(im_rois, im_scales[im_i]) + batch_ind = im_i * np.ones((rois.shape[0], 1)) + rois_blob_this_image = np.hstack((batch_ind, rois)) + rois_blob = np.vstack((rois_blob, rois_blob_this_image)) + + # Add to labels, bbox targets, and bbox loss blobs + labels_blob = np.hstack((labels_blob, labels)) + bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets)) + bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights)) + # all_overlaps = np.hstack((all_overlaps, overlaps)) + + # For debug visualizations + # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps) + + blobs['rois'] = rois_blob + blobs['labels'] = labels_blob + + if cfg.TRAIN.BBOX_REG: + blobs['bbox_targets'] = bbox_targets_blob + blobs['bbox_inside_weights'] = bbox_inside_blob + blobs['bbox_outside_weights'] = \ + np.array(bbox_inside_blob > 0).astype(np.float32) + + return blobs + +def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. 
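+    Foreground RoIs are those with max overlap >= cfg.TRAIN.FG_THRESH;
+    background RoIs fall in [BG_THRESH_LO, BG_THRESH_HI).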
+ """ + # label = class RoI has max overlap with + labels = roidb['max_classes'] + overlaps = roidb['max_overlaps'] + rois = roidb['boxes'] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice( + fg_inds, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & + (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, + bg_inds.size) + # Sample foreground regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice( + bg_inds, size=bg_rois_per_this_image, replace=False) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[fg_rois_per_this_image:] = 0 + overlaps = overlaps[keep_inds] + rois = rois[keep_inds] + + bbox_targets, bbox_inside_weights = _get_bbox_regression_labels( + roidb['bbox_targets'][keep_inds, :], num_classes) + + return labels, overlaps, rois, bbox_targets, bbox_inside_weights + +def _get_image_blob(roidb, scale_inds): + """Builds an input blob from the images in the roidb at the specified + scales. + """ + num_images = len(roidb) + processed_ims = [] + im_scales = [] + for i in xrange(num_images): + im = cv2.imread(roidb[i]['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + target_size = cfg.TRAIN.SCALES[scale_inds[i]] + im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, + cfg.TRAIN.MAX_SIZE) + im_scales.append(im_scale) + processed_ims.append(im) + + # Create a blob to hold the input images + blob = im_list_to_blob(processed_ims) + + return blob, im_scales + +def _project_im_rois(im_rois, im_scale_factor): + """Project image RoIs into the rescaled training image.""" + rois = im_rois * im_scale_factor + return rois + +def _get_bbox_regression_labels(bbox_target_data, num_classes): + """Bounding-box regression targets are stored in a compact form in the + roidb. + + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). The loss weights + are similarly expanded. 
+ + Returns: + bbox_target_data (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + clss = bbox_target_data[:, 0] + bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) + bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = clss[ind] + start = 4 * cls + end = start + 4 + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS + return bbox_targets, bbox_inside_weights + +def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps): + """Visualize a mini-batch for debugging.""" + import matplotlib.pyplot as plt + for i in xrange(rois_blob.shape[0]): + rois = rois_blob[i, :] + im_ind = rois[0] + roi = rois[1:] + im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy() + im += cfg.PIXEL_MEANS + im = im[:, :, (2, 1, 0)] + im = im.astype(np.uint8) + cls = labels_blob[i] + plt.imshow(im) + print 'class: ', cls, ' overlap: ', overlaps[i] + plt.gca().add_patch( + plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0], + roi[3] - roi[1], fill=False, + edgecolor='r', linewidth=3) + ) + plt.show() diff --git a/examples/faster-rcnn/lib/roi_data_layer/roidb.py b/examples/faster-rcnn/lib/roi_data_layer/roidb.py new file mode 100644 index 000000000..97a6a7612 --- /dev/null +++ b/examples/faster-rcnn/lib/roi_data_layer/roidb.py @@ -0,0 +1,133 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Transform a roidb into a trainable roidb by adding a bunch of metadata.""" + +import numpy as np +from fast_rcnn.config import cfg +from fast_rcnn.bbox_transform import bbox_transform +from utils.cython_bbox import bbox_overlaps +import PIL + +def prepare_roidb(imdb): + """Enrich the imdb's roidb by adding some derived quantities that + are useful for training. This function precomputes the maximum + overlap, taken over ground-truth boxes, between each ROI and + each ground-truth box. The class with maximum overlap is also + recorded. + """ + sizes = [PIL.Image.open(imdb.image_path_at(i)).size + for i in xrange(imdb.num_images)] + roidb = imdb.roidb + for i in xrange(len(imdb.image_index)): + roidb[i]['image'] = imdb.image_path_at(i) + roidb[i]['width'] = sizes[i][0] + roidb[i]['height'] = sizes[i][1] + # need gt_overlaps as a dense array for argmax + gt_overlaps = roidb[i]['gt_overlaps'].toarray() + # max overlap with gt over classes (columns) + max_overlaps = gt_overlaps.max(axis=1) + # gt class that had the max overlap + max_classes = gt_overlaps.argmax(axis=1) + roidb[i]['max_classes'] = max_classes + roidb[i]['max_overlaps'] = max_overlaps + # sanity checks + # max overlap of 0 => class should be zero (background) + zero_inds = np.where(max_overlaps == 0)[0] + assert all(max_classes[zero_inds] == 0) + # max overlap > 0 => class should not be zero (must be a fg class) + nonzero_inds = np.where(max_overlaps > 0)[0] + assert all(max_classes[nonzero_inds] != 0) + +def add_bbox_regression_targets(roidb): + """Add information needed to train bounding-box regressors.""" + assert len(roidb) > 0 + assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
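A note on the statistics computed next: the per-class means and stds are accumulated in a single pass using the identity var(x) = E(x^2) - E(x)^2, so the targets never have to be held in memory at once. A quick standalone check of that identity (a sketch, independent of the project code):

```python
import numpy as np

x = np.random.randn(1000, 4)        # stand-in for per-class bbox targets
n = x.shape[0]
sums = x.sum(axis=0)
squared_sums = (x ** 2).sum(axis=0)
means = sums / n
stds = np.sqrt(squared_sums / n - means ** 2)
assert np.allclose(means, x.mean(axis=0))
assert np.allclose(stds, x.std(axis=0))
```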
+ + num_images = len(roidb) + # Infer number of classes from the number of columns in gt_overlaps + num_classes = roidb[0]['gt_overlaps'].shape[1] + for im_i in xrange(num_images): + rois = roidb[im_i]['boxes'] + max_overlaps = roidb[im_i]['max_overlaps'] + max_classes = roidb[im_i]['max_classes'] + roidb[im_i]['bbox_targets'] = \ + _compute_targets(rois, max_overlaps, max_classes) + + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Use fixed / precomputed "means" and "stds" instead of empirical values + means = np.tile( + np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) + stds = np.tile( + np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) + else: + # Compute values needed for means and stds + # var(x) = E(x^2) - E(x)^2 + class_counts = np.zeros((num_classes, 1)) + cfg.EPS + sums = np.zeros((num_classes, 4)) + squared_sums = np.zeros((num_classes, 4)) + for im_i in xrange(num_images): + targets = roidb[im_i]['bbox_targets'] + for cls in xrange(1, num_classes): + cls_inds = np.where(targets[:, 0] == cls)[0] + if cls_inds.size > 0: + class_counts[cls] += cls_inds.size + sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) + squared_sums[cls, :] += \ + (targets[cls_inds, 1:] ** 2).sum(axis=0) + + means = sums / class_counts + stds = np.sqrt(squared_sums / class_counts - means ** 2) + + print 'bbox target means:' + print means + print means[1:, :].mean(axis=0) # ignore bg class + print 'bbox target stdevs:' + print stds + print stds[1:, :].mean(axis=0) # ignore bg class + + # Normalize targets + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: + print "Normalizing targets" + for im_i in xrange(num_images): + targets = roidb[im_i]['bbox_targets'] + for cls in xrange(1, num_classes): + cls_inds = np.where(targets[:, 0] == cls)[0] + roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] + roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] + else: + print "NOT normalizing targets" + + # These values will be needed for making predictions + # (the predicts will need to be unnormalized and uncentered) + return means.ravel(), stds.ravel() + +def _compute_targets(rois, overlaps, labels): + """Compute bounding-box regression targets for an image.""" + # Indices of ground-truth ROIs + gt_inds = np.where(overlaps == 1)[0] + if len(gt_inds) == 0: + # Bail if the image has no ground-truth ROIs + return np.zeros((rois.shape[0], 5), dtype=np.float32) + # Indices of examples for which we try to make predictions + ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] + + # Get IoU overlap between each ex ROI and gt ROI + ex_gt_overlaps = bbox_overlaps( + np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), + np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) + + # Find which gt ROI each ex ROI has max overlap with: + # this will be the ex ROI's gt target + gt_assignment = ex_gt_overlaps.argmax(axis=1) + gt_rois = rois[gt_inds[gt_assignment], :] + ex_rois = rois[ex_inds, :] + + targets = np.zeros((rois.shape[0], 5), dtype=np.float32) + targets[ex_inds, 0] = labels[ex_inds] + targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) + return targets diff --git a/examples/faster-rcnn/lib/rpn/README.md b/examples/faster-rcnn/lib/rpn/README.md new file mode 100644 index 000000000..80abf48e2 --- /dev/null +++ b/examples/faster-rcnn/lib/rpn/README.md @@ -0,0 +1,23 @@ +### `rpn` module overview + +##### `generate_anchors.py` + +Generates a regular grid of multi-scale, multi-aspect anchor boxes. 
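A usage sketch for the anchor generator described above (assuming `lib/rpn` is on the import path; the signature is the one defined in `generate_anchors.py` later in this patch):

```python
import numpy as np
from generate_anchors import generate_anchors

# 3 aspect ratios x 3 scales -> 9 reference anchors around a 16x16 cell,
# each row in (x1, y1, x2, y2) form.
anchors = generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                           scales=2 ** np.arange(3, 6))
print(anchors.shape)  # (9, 4)
```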
+
+##### `proposal_layer.py`
+
+Converts RPN outputs (per-anchor scores and bbox regression estimates) into object proposals.
+
+##### `anchor_target_layer.py`
+
+Generates training targets/labels for each anchor. Classification labels are 1 (object), 0 (not object) or -1 (ignore).
+Bbox regression targets are specified when the classification label is > 0.
+
+##### `proposal_target_layer.py`
+
+Generates training targets/labels for each object proposal: classification labels 0 - K (bg or object class 1, ... , K)
+and bbox regression targets in the case that the label is > 0.
+
+##### `generate.py`
+
+Generate object detection proposals from an imdb using an RPN.
diff --git a/examples/faster-rcnn/lib/rpn/__init__.py b/examples/faster-rcnn/lib/rpn/__init__.py
new file mode 100644
index 000000000..23b31b2da
--- /dev/null
+++ b/examples/faster-rcnn/lib/rpn/__init__.py
@@ -0,0 +1,6 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Sean Bell
+# --------------------------------------------------------
diff --git a/examples/faster-rcnn/lib/rpn/anchor_target_layer.py b/examples/faster-rcnn/lib/rpn/anchor_target_layer.py
new file mode 100644
index 000000000..3934cddf2
--- /dev/null
+++ b/examples/faster-rcnn/lib/rpn/anchor_target_layer.py
@@ -0,0 +1,281 @@
+# --------------------------------------------------------
+# Faster R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Sean Bell
+# --------------------------------------------------------
+
+import os
+import caffe
+import yaml
+from fast_rcnn.config import cfg
+import numpy as np
+import numpy.random as npr
+from generate_anchors import generate_anchors
+from utils.cython_bbox import bbox_overlaps
+from fast_rcnn.bbox_transform import bbox_transform
+
+DEBUG = False
+
+class AnchorTargetLayer(caffe.Layer):
+    """
+    Assign anchors to ground-truth targets. Produces anchor classification
+    labels and bounding-box regression targets.
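+    Classification labels are 1 (object), 0 (not object) or -1 (ignore), as
+    summarized in the rpn README above.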
+ """ + + def setup(self, bottom, top): + layer_params = yaml.load(self.param_str) + anchor_scales = layer_params.get('scales', (8, 16, 32)) + self._anchors = generate_anchors(scales=np.array(anchor_scales)) + self._num_anchors = self._anchors.shape[0] + self._feat_stride = layer_params['feat_stride'] + + if DEBUG: + print 'anchors:' + print self._anchors + print 'anchor shapes:' + print np.hstack(( + self._anchors[:, 2::4] - self._anchors[:, 0::4], + self._anchors[:, 3::4] - self._anchors[:, 1::4], + )) + self._counts = cfg.EPS + self._sums = np.zeros((1, 4)) + self._squared_sums = np.zeros((1, 4)) + self._fg_sum = 0 + self._bg_sum = 0 + self._count = 0 + + # allow boxes to sit over the edge by a small amount + self._allowed_border = layer_params.get('allowed_border', 0) + + height, width = bottom[0].data.shape[-2:] + if DEBUG: + print 'AnchorTargetLayer: height', height, 'width', width + + A = self._num_anchors + # labels + top[0].reshape(1, 1, A * height, width) + # bbox_targets + top[1].reshape(1, A * 4, height, width) + # bbox_inside_weights + top[2].reshape(1, A * 4, height, width) + # bbox_outside_weights + top[3].reshape(1, A * 4, height, width) + + def forward(self, bottom, top): + # Algorithm: + # + # for each (H, W) location i + # generate 9 anchor boxes centered on cell i + # apply predicted bbox deltas at cell i to each of the 9 anchors + # filter out-of-image anchors + # measure GT overlap + + assert bottom[0].data.shape[0] == 1, \ + 'Only single item batches are supported' + + # map of shape (..., H, W) + height, width = bottom[0].data.shape[-2:] + # GT boxes (x1, y1, x2, y2, label) + gt_boxes = bottom[1].data + # im_info + im_info = bottom[2].data[0, :] + + if DEBUG: + print '' + print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) + print 'scale: {}'.format(im_info[2]) + print 'height, width: ({}, {})'.format(height, width) + print 'rpn: gt_boxes.shape', gt_boxes.shape + print 'rpn: gt_boxes', gt_boxes + + # 1. 
Generate proposals from bbox deltas and shifted anchors + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = (self._anchors.reshape((1, A, 4)) + + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + total_anchors = int(K * A) + + # only keep anchors inside the image + inds_inside = np.where( + (all_anchors[:, 0] >= -self._allowed_border) & + (all_anchors[:, 1] >= -self._allowed_border) & + (all_anchors[:, 2] < im_info[1] + self._allowed_border) & # width + (all_anchors[:, 3] < im_info[0] + self._allowed_border) # height + )[0] + + if DEBUG: + print 'total_anchors', total_anchors + print 'inds_inside', len(inds_inside) + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + if DEBUG: + print 'anchors.shape', anchors.shape + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside), ), dtype=np.float32) + labels.fill(-1) + + # overlaps between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps( + np.ascontiguousarray(anchors, dtype=np.float), + np.ascontiguousarray(gt_boxes, dtype=np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, + np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IOU + labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + if cfg.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # subsample positive labels if we have too many + num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice( + fg_inds, size=(len(fg_inds) - num_fg), replace=False) + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice( + bg_inds, size=(len(bg_inds) - num_bg), replace=False) + labels[disable_inds] = -1 + #print "was %s inds, disabling %s, now %s inds" % ( + #len(bg_inds), len(disable_inds), np.sum(labels == 0)) + + bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + + bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) + bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) + + bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) + if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: + # uniform weighting of examples (given non-uniform sampling) + 
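+            # (both fg and bg anchors then receive outside weight 1/num_examples,
+            # so the loss is averaged over the whole sampled RPN minibatch)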
num_examples = np.sum(labels >= 0) + positive_weights = np.ones((1, 4)) * 1.0 / num_examples + negative_weights = np.ones((1, 4)) * 1.0 / num_examples + else: + assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & + (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) + positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / + np.sum(labels == 1)) + negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / + np.sum(labels == 0)) + bbox_outside_weights[labels == 1, :] = positive_weights + bbox_outside_weights[labels == 0, :] = negative_weights + + if DEBUG: + self._sums += bbox_targets[labels == 1, :].sum(axis=0) + self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0) + self._counts += np.sum(labels == 1) + means = self._sums / self._counts + stds = np.sqrt(self._squared_sums / self._counts - means ** 2) + print 'means:' + print means + print 'stdevs:' + print stds + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) + bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) + + if DEBUG: + print 'rpn: max max_overlap', np.max(max_overlaps) + print 'rpn: num_positive', np.sum(labels == 1) + print 'rpn: num_negative', np.sum(labels == 0) + self._fg_sum += np.sum(labels == 1) + self._bg_sum += np.sum(labels == 0) + self._count += 1 + print 'rpn: num_positive avg', self._fg_sum / self._count + print 'rpn: num_negative avg', self._bg_sum / self._count + + # labels + labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) + labels = labels.reshape((1, 1, A * height, width)) + top[0].reshape(*labels.shape) + top[0].data[...] = labels + + # bbox_targets + bbox_targets = bbox_targets \ + .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) + top[1].reshape(*bbox_targets.shape) + top[1].data[...] = bbox_targets + + # bbox_inside_weights + bbox_inside_weights = bbox_inside_weights \ + .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) + assert bbox_inside_weights.shape[2] == height + assert bbox_inside_weights.shape[3] == width + top[2].reshape(*bbox_inside_weights.shape) + top[2].data[...] = bbox_inside_weights + + # bbox_outside_weights + bbox_outside_weights = bbox_outside_weights \ + .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) + assert bbox_outside_weights.shape[2] == height + assert bbox_outside_weights.shape[3] == width + top[3].reshape(*bbox_outside_weights.shape) + top[3].data[...] 
= bbox_outside_weights + + def backward(self, top, propagate_down, bottom): + """This layer does not propagate gradients.""" + pass + + def reshape(self, bottom, top): + """Reshaping happens during the call to forward.""" + pass + + +def _unmap(data, count, inds, fill=0): + """ Unmap a subset of item (data) back to the original set of items (of + size count) """ + if len(data.shape) == 1: + ret = np.empty((count, ), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + +def _compute_targets(ex_rois, gt_rois): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 5 + + return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) diff --git a/examples/faster-rcnn/lib/rpn/generate.py b/examples/faster-rcnn/lib/rpn/generate.py new file mode 100644 index 000000000..060daf434 --- /dev/null +++ b/examples/faster-rcnn/lib/rpn/generate.py @@ -0,0 +1,117 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +from fast_rcnn.config import cfg +from utils.blob import im_list_to_blob +from utils.timer import Timer +import numpy as np +import cv2 + +def _vis_proposals(im, dets, thresh=0.5): + """Draw detected bounding boxes.""" + inds = np.where(dets[:, -1] >= thresh)[0] + if len(inds) == 0: + return + + class_name = 'obj' + im = im[:, :, (2, 1, 0)] + fig, ax = plt.subplots(figsize=(12, 12)) + ax.imshow(im, aspect='equal') + for i in inds: + bbox = dets[i, :4] + score = dets[i, -1] + + ax.add_patch( + plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], fill=False, + edgecolor='red', linewidth=3.5) + ) + ax.text(bbox[0], bbox[1] - 2, + '{:s} {:.3f}'.format(class_name, score), + bbox=dict(facecolor='blue', alpha=0.5), + fontsize=14, color='white') + + ax.set_title(('{} detections with ' + 'p({} | box) >= {:.1f}').format(class_name, class_name, + thresh), + fontsize=14) + plt.axis('off') + plt.tight_layout() + plt.draw() + +def _get_image_blob(im): + """Converts an image into a network input. 
+ + Arguments: + im (ndarray): a color image in BGR order + + Returns: + blob (ndarray): a data blob holding an image pyramid + im_scale_factors (list): list of image scales (relative to im) used + in the image pyramid + """ + im_orig = im.astype(np.float32, copy=True) + im_orig -= cfg.PIXEL_MEANS + + im_shape = im_orig.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + processed_ims = [] + + assert len(cfg.TEST.SCALES) == 1 + target_size = cfg.TEST.SCALES[0] + + im_scale = float(target_size) / float(im_size_min) + # Prevent the biggest axis from being more than MAX_SIZE + if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: + im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) + im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, + interpolation=cv2.INTER_LINEAR) + im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] + processed_ims.append(im) + + # Create a blob to hold the input images + blob = im_list_to_blob(processed_ims) + + return blob, im_info + +def im_proposals(net, im): + """Generate RPN proposals on a single image.""" + blobs = {} + blobs['data'], blobs['im_info'] = _get_image_blob(im) + net.blobs['data'].reshape(*(blobs['data'].shape)) + net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) + blobs_out = net.forward( + data=blobs['data'].astype(np.float32, copy=False), + im_info=blobs['im_info'].astype(np.float32, copy=False)) + + scale = blobs['im_info'][0, 2] + boxes = blobs_out['rois'][:, 1:].copy() / scale + scores = blobs_out['scores'].copy() + return boxes, scores + +def imdb_proposals(net, imdb): + """Generate RPN proposals on all images in an imdb.""" + + _t = Timer() + imdb_boxes = [[] for _ in xrange(imdb.num_images)] + for i in xrange(imdb.num_images): + im = cv2.imread(imdb.image_path_at(i)) + _t.tic() + imdb_boxes[i], scores = im_proposals(net, im) + _t.toc() + print 'im_proposals: {:d}/{:d} {:.3f}s' \ + .format(i + 1, imdb.num_images, _t.average_time) + if 0: + dets = np.hstack((imdb_boxes[i], scores)) + # from IPython import embed; embed() + _vis_proposals(im, dets[:3, :], thresh=0.9) + plt.show() + + return imdb_boxes diff --git a/examples/faster-rcnn/lib/rpn/generate_anchors.py b/examples/faster-rcnn/lib/rpn/generate_anchors.py new file mode 100644 index 000000000..1125a801f --- /dev/null +++ b/examples/faster-rcnn/lib/rpn/generate_anchors.py @@ -0,0 +1,105 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import numpy as np + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +#array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + +def generate_anchors(base_size=16, ratios=[0.5, 1, 2], + scales=2**np.arange(3, 6)): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. 
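+    With the default ratios and scales this yields the 9 reference anchors
+    tabulated in the comment above.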
+ """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) + for i in xrange(ratio_anchors.shape[0])]) + return anchors + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). + """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + +if __name__ == '__main__': + import time + t = time.time() + a = generate_anchors() + print time.time() - t + print a + from IPython import embed; embed() diff --git a/examples/faster-rcnn/lib/rpn/proposal_layer.py b/examples/faster-rcnn/lib/rpn/proposal_layer.py new file mode 100644 index 000000000..2ce98bd79 --- /dev/null +++ b/examples/faster-rcnn/lib/rpn/proposal_layer.py @@ -0,0 +1,177 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import caffe +import numpy as np +import yaml +from fast_rcnn.config import cfg +from generate_anchors import generate_anchors +from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes +from fast_rcnn.nms_wrapper import nms + +DEBUG = False + +class ProposalLayer(caffe.Layer): + """ + Outputs object detection proposals by applying estimated bounding-box + transformations to a set of regular boxes (called "anchors"). 
+ """ + + def setup(self, bottom, top): + # parse the layer parameter string, which must be valid YAML + layer_params = yaml.load(self.param_str) + + self._feat_stride = layer_params['feat_stride'] + anchor_scales = layer_params.get('scales', (8, 16, 32)) + self._anchors = generate_anchors(scales=np.array(anchor_scales)) + self._num_anchors = self._anchors.shape[0] + + if DEBUG: + print 'feat_stride: {}'.format(self._feat_stride) + print 'anchors:' + print self._anchors + + # rois blob: holds R regions of interest, each is a 5-tuple + # (n, x1, y1, x2, y2) specifying an image batch index n and a + # rectangle (x1, y1, x2, y2) + top[0].reshape(1, 5) + + # scores blob: holds scores for R regions of interest + if len(top) > 1: + top[1].reshape(1, 1, 1, 1) + + def forward(self, bottom, top): + # Algorithm: + # + # for each (H, W) location i + # generate A anchor boxes centered on cell i + # apply predicted bbox deltas at cell i to each of the A anchors + # clip predicted boxes to image + # remove predicted boxes with either height or width < threshold + # sort all (proposal, score) pairs by score from highest to lowest + # take top pre_nms_topN proposals before NMS + # apply NMS with threshold 0.7 to remaining proposals + # take after_nms_topN proposals after NMS + # return the top proposals (-> RoIs top, scores top) + + assert bottom[0].data.shape[0] == 1, \ + 'Only single item batches are supported' + + #cfg_key = str(self.phase) # either 'TRAIN' or 'TEST' + cfg_key = str('TRAIN' if self.phase == 0 else 'TEST') + pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N + post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N + nms_thresh = cfg[cfg_key].RPN_NMS_THRESH + min_size = cfg[cfg_key].RPN_MIN_SIZE + + # the first set of _num_anchors channels are bg probs + # the second set are the fg probs, which we want + scores = bottom[0].data[:, self._num_anchors:, :, :] + bbox_deltas = bottom[1].data + im_info = bottom[2].data[0, :] + + if DEBUG: + print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) + print 'scale: {}'.format(im_info[2]) + + # 1. 
Generate proposals from bbox deltas and shifted anchors + height, width = scores.shape[-2:] + + if DEBUG: + print 'score map size: {}'.format(scores.shape) + + # Enumerate all shifts + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel())).transpose() + + # Enumerate all shifted anchors: + # + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + anchors = self._anchors.reshape((1, A, 4)) + \ + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) + anchors = anchors.reshape((K * A, 4)) + + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # + # bbox deltas will be (1, 4 * A, H, W) format + # transpose to (1, H, W, 4 * A) + # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) + # in slowest to fastest order + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # Same story for the scores: + # + # scores are (1, A, H, W) format + # transpose to (1, H, W, A) + # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_transform_inv(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im_info[:2]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = _filter_boxes(proposals, min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 6000) + order = scores.ravel().argsort()[::-1] + if pre_nms_topN > 0: + order = order[:pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), nms_thresh) + if post_nms_topN > 0: + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep] + + # Output rois blob + # Our RPN implementation only supports a single input image, so all + # batch inds are 0 + batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) + blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) + top[0].reshape(*(blob.shape)) + top[0].data[...] = blob + + # [Optional] output scores blob + if len(top) > 1: + top[1].reshape(*(scores.shape)) + top[1].data[...] 
= scores + + def backward(self, top, propagate_down, bottom): + """This layer does not propagate gradients.""" + pass + + def reshape(self, bottom, top): + """Reshaping happens during the call to forward.""" + pass + +def _filter_boxes(boxes, min_size): + """Remove all boxes with any side smaller than min_size.""" + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep diff --git a/examples/faster-rcnn/lib/rpn/proposal_target_layer.py b/examples/faster-rcnn/lib/rpn/proposal_target_layer.py new file mode 100644 index 000000000..f474ce96b --- /dev/null +++ b/examples/faster-rcnn/lib/rpn/proposal_target_layer.py @@ -0,0 +1,193 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + +import caffe +import yaml +import numpy as np +import numpy.random as npr +from fast_rcnn.config import cfg +from fast_rcnn.bbox_transform import bbox_transform +from utils.cython_bbox import bbox_overlaps + +DEBUG = False + +class ProposalTargetLayer(caffe.Layer): + """ + Assign object detection proposals to ground-truth targets. Produces proposal + classification labels and bounding-box regression targets. + """ + + def setup(self, bottom, top): + layer_params = yaml.load(self.param_str) + self._num_classes = layer_params['num_classes'] + + # sampled rois (0, x1, y1, x2, y2) + top[0].reshape(1, 5) + # labels + top[1].reshape(1, 1) + # bbox_targets + top[2].reshape(1, self._num_classes * 4) + # bbox_inside_weights + top[3].reshape(1, self._num_classes * 4) + # bbox_outside_weights + top[4].reshape(1, self._num_classes * 4) + + def forward(self, bottom, top): + # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN + # (i.e., rpn.proposal_layer.ProposalLayer), or any other source + all_rois = bottom[0].data + # GT boxes (x1, y1, x2, y2, label) + # TODO(rbg): it's annoying that sometimes I have extra info before + # and other times after box coordinates -- normalize to one format + gt_boxes = bottom[1].data + + # Include ground-truth boxes in the set of candidate rois + zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) + all_rois = np.vstack( + (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) + ) + + # Sanity check: single batch only + assert np.all(all_rois[:, 0] == 0), \ + 'Only single item batches are supported' + + num_images = 1 + rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images + fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) + + # Sample rois with classification labels and bounding box regression + # targets + labels, rois, bbox_targets, bbox_inside_weights = _sample_rois( + all_rois, gt_boxes, fg_rois_per_image, + rois_per_image, self._num_classes) + + if DEBUG: + print 'num fg: {}'.format((labels > 0).sum()) + print 'num bg: {}'.format((labels == 0).sum()) + self._count += 1 + self._fg_num += (labels > 0).sum() + self._bg_num += (labels == 0).sum() + print 'num fg avg: {}'.format(self._fg_num / self._count) + print 'num bg avg: {}'.format(self._bg_num / self._count) + print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) + + # sampled rois + top[0].reshape(*rois.shape) + top[0].data[...] = rois + + # classification labels + top[1].reshape(*labels.shape) + top[1].data[...] 
= labels + + # bbox_targets + top[2].reshape(*bbox_targets.shape) + top[2].data[...] = bbox_targets + + # bbox_inside_weights + top[3].reshape(*bbox_inside_weights.shape) + top[3].data[...] = bbox_inside_weights + + # bbox_outside_weights + top[4].reshape(*bbox_inside_weights.shape) + top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32) + + def backward(self, top, propagate_down, bottom): + """This layer does not propagate gradients.""" + pass + + def reshape(self, bottom, top): + """Reshaping happens during the call to forward.""" + pass + + +def _get_bbox_regression_labels(bbox_target_data, num_classes): + """Bounding-box regression targets (bbox_target_data) are stored in a + compact form N x (class, tx, ty, tw, th) + + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). + + Returns: + bbox_target (ndarray): N x 4K blob of regression targets + bbox_inside_weights (ndarray): N x 4K blob of loss weights + """ + + clss = bbox_target_data[:, 0] + bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) + bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) + inds = np.where(clss > 0)[0] + for ind in inds: + cls = clss[ind] + start = int(4 * cls) + end = int(start + 4) + bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] + bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS + return bbox_targets, bbox_inside_weights + + +def _compute_targets(ex_rois, gt_rois, labels): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 4 + + targets = bbox_transform(ex_rois, gt_rois) + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) + / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) + return np.hstack( + (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) + +def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. 
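+
+    At most fg_rois_per_image foreground RoIs (those with max overlap >=
+    cfg.TRAIN.FG_THRESH) are kept; background RoIs with overlap in
+    [cfg.TRAIN.BG_THRESH_LO, cfg.TRAIN.BG_THRESH_HI) fill the remaining
+    rois_per_image slots, and their labels are clamped to 0.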
+ """ + # overlaps: (rois x gt_boxes) + overlaps = bbox_overlaps( + np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), + np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) + gt_assignment = overlaps.argmax(axis=1) + max_overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + # Select foreground RoIs as those with >= FG_THRESH overlap + fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] + # Guard against the case when an image has fewer than fg_rois_per_image + # foreground RoIs + fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) + # Sample foreground regions without replacement + if fg_inds.size > 0: + fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_this_image), replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & + (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] + # Compute number of background RoIs to take from this image (guarding + # against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) + # Sample background regions without replacement + if bg_inds.size > 0: + bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_this_image), replace=False) + + # The indices that we're selecting (both fg and bg) + keep_inds = np.append(fg_inds, bg_inds) + # Select sampled values from various arrays: + labels = labels[keep_inds] + # Clamp labels for the background RoIs to 0 + labels[int(fg_rois_per_this_image):] = 0 + rois = all_rois[keep_inds] + + bbox_target_data = _compute_targets( + rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) + + bbox_targets, bbox_inside_weights = \ + _get_bbox_regression_labels(bbox_target_data, num_classes) + + return labels, rois, bbox_targets, bbox_inside_weights diff --git a/examples/faster-rcnn/lib/setup.py b/examples/faster-rcnn/lib/setup.py new file mode 100644 index 000000000..df9a0db76 --- /dev/null +++ b/examples/faster-rcnn/lib/setup.py @@ -0,0 +1,162 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import os +from os.path import join as pjoin +from setuptools import setup +from distutils.extension import Extension +from Cython.Distutils import build_ext +import subprocess +import numpy as np + +def find_in_path(name, path): + "Find a file in a search path" + # Adapted fom + # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ + for dir in path.split(os.pathsep): + binpath = pjoin(dir, name) + if os.path.exists(binpath): + return os.path.abspath(binpath) + return None + + +def locate_cuda(): + """Locate the CUDA environment on the system + + Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' + and values giving the absolute path to each directory. + + Starts by looking for the CUDAHOME env variable. If not found, everything + is based on finding 'nvcc' in the PATH. 
+ """ + + # first check if the CUDAHOME env variable is in use + if 'CUDAHOME' in os.environ: + home = os.environ['CUDAHOME'] + nvcc = pjoin(home, 'bin', 'nvcc') + else: + # otherwise, search the PATH for NVCC + default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') + nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) + if nvcc is None: + raise EnvironmentError('The nvcc binary could not be ' + 'located in your $PATH. Either add it to your path, or set $CUDAHOME') + home = os.path.dirname(os.path.dirname(nvcc)) + + cudaconfig = {'home':home, 'nvcc':nvcc, + 'include': pjoin(home, 'include'), + 'lib64': pjoin(home, 'lib64')} + for k, v in cudaconfig.iteritems(): + if not os.path.exists(v): + raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) + + return cudaconfig +#CUDA = locate_cuda() + + +# Obtain the numpy include directory. This logic works across numpy versions. +try: + numpy_include = np.get_include() +except AttributeError: + numpy_include = np.get_numpy_include() + +def customize_compiler_for_nvcc(self): + """inject deep into distutils to customize how the dispatch + to gcc/nvcc works. + + If you subclass UnixCCompiler, it's not trivial to get your subclass + injected in, and still have the right customizations (i.e. + distutils.sysconfig.customize_compiler) run on it. So instead of going + the OO route, I have this. Note, it's kindof like a wierd functional + subclassing going on.""" + + # tell the compiler it can processes .cu + self.src_extensions.append('.cu') + + # save references to the default compiler_so and _comple methods + default_compiler_so = self.compiler_so + super = self._compile + + # now redefine the _compile method. This gets executed for each + # object but distutils doesn't have the ability to change compilers + # based on source extension: we add it. 
+ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 translated + # from the extra_compile_args in the Extension class + postargs = extra_postargs['nvcc'] + else: + postargs = extra_postargs['gcc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # inject our redefined _compile method into the class + self._compile = _compile + + +# run the customize_compiler +class custom_build_ext(build_ext): + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + +ext_modules = [ + Extension( + "utils.cython_mathutils", + ["utils/mathutils.pyx"], + extra_compile_args={'gcc': ["-O3", "-Wno-cpp", "-Wno-unused-function"]}, + include_dirs = [numpy_include] + ), + Extension( + "utils.cython_bbox", + ["utils/bbox.pyx"], + extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, + include_dirs = [numpy_include] + ), + Extension( + "nms.cpu_nms", + ["nms/cpu_nms.pyx"], + extra_compile_args={'gcc': ["-O3", "-Wno-cpp", "-Wno-unused-function"]}, + include_dirs = [numpy_include] + ), + # Extension('nms.gpu_nms', + # ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], + # library_dirs=[CUDA['lib64']], + # libraries=['cudart'], + # language='c++', + # runtime_library_dirs=[CUDA['lib64']], + # # this syntax is specific to this build system + # # we're only going to use certain compiler args with nvcc and not with + # # gcc the implementation of this trick is in customize_compiler() below + # extra_compile_args={'gcc': ["-Wno-unused-function"], + # 'nvcc': ['-arch=sm_35', + # '--ptxas-options=-v', + # '-c', + # '--compiler-options', + # "'-fPIC'"]}, + # include_dirs = [numpy_include, CUDA['include']] + # ), + Extension( + 'pycocotools._mask', + sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], + include_dirs = [numpy_include, 'pycocotools'], + extra_compile_args={ + 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, + ), +] + +setup( + name='fast_rcnn', + ext_modules=ext_modules, + # inject our custom trigger + cmdclass={'build_ext': custom_build_ext}, +) diff --git a/examples/faster-rcnn/lib/transform/__init__.py b/examples/faster-rcnn/lib/transform/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/faster-rcnn/lib/transform/torch_image_transform_layer.py b/examples/faster-rcnn/lib/transform/torch_image_transform_layer.py new file mode 100644 index 000000000..9273b3a76 --- /dev/null +++ b/examples/faster-rcnn/lib/transform/torch_image_transform_layer.py @@ -0,0 +1,64 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + +""" Transform images for compatibility with models trained with +https://github.com/facebook/fb.resnet.torch. 
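+
+The layer undoes Caffe's BGR, 0-255, mean-subtracted input convention: it adds
+the old pixel means back, reorders BGR to RGB and rescales to [0, 1], then
+subtracts the Torch per-channel means and divides by the per-channel stds.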
+ +Usage in model prototxt: + +layer { + name: 'data_xform' + type: 'Python' + bottom: 'data_caffe' + top: 'data' + python_param { + module: 'transform.torch_image_transform_layer' + layer: 'TorchImageTransformLayer' + } +} +""" + +import caffe +from fast_rcnn.config import cfg +import numpy as np + +class TorchImageTransformLayer(caffe.Layer): + def setup(self, bottom, top): + # (1, 3, 1, 1) shaped arrays + self.PIXEL_MEANS = \ + np.array([[[[0.48462227599918]], + [[0.45624044862054]], + [[0.40588363755159]]]]) + self.PIXEL_STDS = \ + np.array([[[[0.22889466674951]], + [[0.22446679341259]], + [[0.22495548344775]]]]) + # The default ("old") pixel means that were already subtracted + channel_swap = (0, 3, 1, 2) + self.OLD_PIXEL_MEANS = \ + cfg.PIXEL_MEANS[np.newaxis, :, :, :].transpose(channel_swap) + + top[0].reshape(*(bottom[0].shape)) + + def forward(self, bottom, top): + ims = bottom[0].data + # Invert the channel means that were already subtracted + ims += self.OLD_PIXEL_MEANS + # 1. Permute BGR to RGB and normalize to [0, 1] + ims = ims[:, [2, 1, 0], :, :] / 255.0 + # 2. Remove channel means + ims -= self.PIXEL_MEANS + # 3. Standardize channels + ims /= self.PIXEL_STDS + top[0].reshape(*(ims.shape)) + top[0].data[...] = ims + + def backward(self, top, propagate_down, bottom): + """This layer does not propagate gradients.""" + pass + + def reshape(self, bottom, top): + """Reshaping happens during the call to forward.""" + pass diff --git a/examples/faster-rcnn/lib/utils/__init__.py b/examples/faster-rcnn/lib/utils/__init__.py new file mode 100644 index 000000000..7ba6a65c5 --- /dev/null +++ b/examples/faster-rcnn/lib/utils/__init__.py @@ -0,0 +1,6 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- diff --git a/examples/faster-rcnn/lib/utils/bbox.pyx b/examples/faster-rcnn/lib/utils/bbox.pyx new file mode 100644 index 000000000..e14780ddd --- /dev/null +++ b/examples/faster-rcnn/lib/utils/bbox.pyx @@ -0,0 +1,55 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + +DTYPE = np.float +ctypedef np.float_t DTYPE_t + +def bbox_overlaps( + np.ndarray[DTYPE_t, ndim=2] boxes, + np.ndarray[DTYPE_t, ndim=2] query_boxes): + """ + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + cdef DTYPE_t iw, ih, box_area + cdef DTYPE_t ua + cdef unsigned int k, n + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + 1 + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + 1 + ) + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) * + (boxes[n, 3] - boxes[n, 1] + 1) + + box_area - iw * ih + ) + overlaps[n, k] 
= iw * ih / ua
+    return overlaps
diff --git a/examples/faster-rcnn/lib/utils/blob.py b/examples/faster-rcnn/lib/utils/blob.py
new file mode 100644
index 000000000..249d179f0
--- /dev/null
+++ b/examples/faster-rcnn/lib/utils/blob.py
@@ -0,0 +1,48 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Blob helper functions."""
+
+import numpy as np
+import cv2
+from utils.cython_mathutils import cpu_subtract
+
+def im_list_to_blob(ims):
+    """Convert a list of images into a network input.
+
+    Assumes images are already prepared (means subtracted, BGR order, ...).
+    """
+    max_shape = np.array([im.shape for im in ims]).max(axis=0)
+    num_images = len(ims)
+    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
+                    dtype=np.float32)
+    for i in xrange(num_images):
+        im = ims[i]
+        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
+    # Move channels (axis 3) to axis 1
+    # Axis order will become: (batch elem, channel, height, width)
+    channel_swap = (0, 3, 1, 2)
+    blob = blob.transpose(channel_swap)
+    return blob
+
+def prep_im_for_blob(im, pixel_means, target_size, max_size):
+    """Mean subtract and scale an image for use in a blob."""
+    im = im.astype(np.float32, copy=False)
+    pixel_means = pixel_means.astype(np.float32, copy=False)
+    # im -= pixel_means
+    cpu_subtract(im, pixel_means)
+    im_shape = im.shape
+    im_size_min = np.min(im_shape[0:2])
+    im_size_max = np.max(im_shape[0:2])
+    im_scale = float(target_size) / float(im_size_min)
+    # Prevent the biggest axis from being more than MAX_SIZE
+    if np.round(im_scale * im_size_max) > max_size:
+        im_scale = float(max_size) / float(im_size_max)
+    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
+                    interpolation=cv2.INTER_LINEAR)
+
+    return im, im_scale
diff --git a/examples/faster-rcnn/lib/utils/mathutils.pyx b/examples/faster-rcnn/lib/utils/mathutils.pyx
new file mode 100644
index 000000000..13f410d9a
--- /dev/null
+++ b/examples/faster-rcnn/lib/utils/mathutils.pyx
@@ -0,0 +1,41 @@
+# --------------------------------------------------------
+# Math Utils
+# Copyright (c) 2017 Intel
+# Licensed under The MIT License [see LICENSE for details]
+# Written by YAO Matrix
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+cimport cython
+cimport openmp
+
+from cython.parallel cimport prange
+from cython.parallel cimport parallel
+
+import os
+from multiprocessing import cpu_count
+cdef int thread_num = 0
+try:
+    # The value read from the environment is a string; it must be converted
+    # before being assigned to a C int.
+    thread_num = int(os.environ["OMP_NUM_THREADS"])
+except (KeyError, ValueError):
+    thread_num = cpu_count() / 2
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def cpu_subtract(np.ndarray[np.float32_t, ndim=3] src, np.ndarray[np.float32_t, ndim=3] scalar):
+    global thread_num
+
+    cdef unsigned int rows = src.shape[0]
+    cdef unsigned int cols = src.shape[1]
+    cdef unsigned int chs = src.shape[2]
+
+    cdef int i, j, k
+
+    with nogil:
+        for i in prange(rows, schedule = 'dynamic', num_threads = thread_num):
+            for j in xrange(cols):
+                for k in xrange(chs):
+                    src[i, j, k] -= scalar[0, 0, k]
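+
+# Usage sketch: utils/blob.py above calls cpu_subtract(im, pixel_means) with
+# an H x W x 3 float32 image and a 1 x 1 x 3 float32 means array; it is an
+# in-place, OpenMP-parallel replacement for `im -= pixel_means`.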
diff --git a/examples/faster-rcnn/lib/utils/timer.py b/examples/faster-rcnn/lib/utils/timer.py
new file mode 100644
index 000000000..dacc942c2
--- /dev/null
+++ b/examples/faster-rcnn/lib/utils/timer.py
@@ -0,0 +1,32 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import time
+
+class Timer(object):
+    """A simple timer."""
+    def __init__(self):
+        self.total_time = 0.
+        self.calls = 0
+        self.start_time = 0.
+        self.diff = 0.
+        self.average_time = 0.
+
+    def tic(self):
+        # using time.time instead of time.clock because time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        self.diff = time.time() - self.start_time
+        self.total_time += self.diff
+        self.calls += 1
+        self.average_time = self.total_time / self.calls
+        if average:
+            return self.average_time
+        else:
+            return self.diff
diff --git a/examples/faster-rcnn/tools/README.md b/examples/faster-rcnn/tools/README.md
new file mode 100644
index 000000000..6fd409412
--- /dev/null
+++ b/examples/faster-rcnn/tools/README.md
@@ -0,0 +1 @@
+Tools for training, testing, and compressing Fast R-CNN networks.
diff --git a/examples/faster-rcnn/tools/_init_paths.py b/examples/faster-rcnn/tools/_init_paths.py
new file mode 100644
index 000000000..20577f0d7
--- /dev/null
+++ b/examples/faster-rcnn/tools/_init_paths.py
@@ -0,0 +1,26 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Set up paths for Fast R-CNN."""
+
+import os.path as osp
+import sys
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+this_dir = osp.dirname(__file__)
+
+# Add caffe to PYTHONPATH
+#caffe_path = osp.join(this_dir, '..', 'caffe-fast-rcnn', 'python')
+caffe_path = osp.join(this_dir, '..', '..', '..', 'python')
+add_path(caffe_path)
+
+# Add lib to PYTHONPATH
+lib_path = osp.join(this_dir, '..', 'lib')
+add_path(lib_path)
diff --git a/examples/faster-rcnn/tools/compress_net.py b/examples/faster-rcnn/tools/compress_net.py
new file mode 100755
index 000000000..e044e5bcd
--- /dev/null
+++ b/examples/faster-rcnn/tools/compress_net.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Compress a Fast R-CNN network using truncated SVD."""
+
+import _init_paths
+import caffe
+import argparse
+import numpy as np
+import os, sys
+
+def parse_args():
+    """Parse input arguments."""
+    parser = argparse.ArgumentParser(description='Compress a Fast R-CNN network')
+    parser.add_argument('--def', dest='prototxt',
+                        help='prototxt file defining the uncompressed network',
+                        default=None, type=str)
+    parser.add_argument('--def-svd', dest='prototxt_svd',
+                        help='prototxt file defining the SVD compressed network',
+                        default=None, type=str)
+    parser.add_argument('--net', dest='caffemodel',
+                        help='model to compress',
+                        default=None, type=str)
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    args = parser.parse_args()
+    return args
+
+def compress_weights(W, l):
+    """Compress the weight matrix W of an inner product (fully connected) layer
+    using truncated SVD.
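+
+    Keeping the top l singular values factors the N x M matrix W into
+    Ul = U[:, :l] (N x l) and L = diag(s_l) * Vl (l x M), cutting the
+    parameter count from N*M to l*(N+M).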
+ + Parameters: + W: N x M weights matrix + l: number of singular values to retain + + Returns: + Ul, L: matrices such that W \approx Ul*L + """ + + # numpy doesn't seem to have a fast truncated SVD algorithm... + # this could be faster + U, s, V = np.linalg.svd(W, full_matrices=False) + + Ul = U[:, :l] + sl = s[:l] + Vl = V[:l, :] + + L = np.dot(np.diag(sl), Vl) + return Ul, L + +def main(): + args = parse_args() + + # prototxt = 'models/VGG16/test.prototxt' + # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' + net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) + + # prototxt_svd = 'models/VGG16/svd/test_fc6_fc7.prototxt' + # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' + net_svd = caffe.Net(args.prototxt_svd, args.caffemodel, caffe.TEST) + + print('Uncompressed network {} : {}'.format(args.prototxt, args.caffemodel)) + print('Compressed network prototxt {}'.format(args.prototxt_svd)) + + out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svd' + out_dir = os.path.dirname(args.caffemodel) + + # Compress fc6 + if net_svd.params.has_key('fc6_L'): + l_fc6 = net_svd.params['fc6_L'][0].data.shape[0] + print(' fc6_L bottleneck size: {}'.format(l_fc6)) + + # uncompressed weights and biases + W_fc6 = net.params['fc6'][0].data + B_fc6 = net.params['fc6'][1].data + + print(' compressing fc6...') + Ul_fc6, L_fc6 = compress_weights(W_fc6, l_fc6) + + assert(len(net_svd.params['fc6_L']) == 1) + + # install compressed matrix factors (and original biases) + net_svd.params['fc6_L'][0].data[...] = L_fc6 + + net_svd.params['fc6_U'][0].data[...] = Ul_fc6 + net_svd.params['fc6_U'][1].data[...] = B_fc6 + + out += '_fc6_{}'.format(l_fc6) + + # Compress fc7 + if net_svd.params.has_key('fc7_L'): + l_fc7 = net_svd.params['fc7_L'][0].data.shape[0] + print ' fc7_L bottleneck size: {}'.format(l_fc7) + + W_fc7 = net.params['fc7'][0].data + B_fc7 = net.params['fc7'][1].data + + print(' compressing fc7...') + Ul_fc7, L_fc7 = compress_weights(W_fc7, l_fc7) + + assert(len(net_svd.params['fc7_L']) == 1) + + net_svd.params['fc7_L'][0].data[...] = L_fc7 + + net_svd.params['fc7_U'][0].data[...] = Ul_fc7 + net_svd.params['fc7_U'][1].data[...] = B_fc7 + + out += '_fc7_{}'.format(l_fc7) + + filename = '{}/{}.caffemodel'.format(out_dir, out) + net_svd.save(filename) + print 'Wrote svd model to: {:s}'.format(filename) + +if __name__ == '__main__': + main() diff --git a/examples/faster-rcnn/tools/demo.py b/examples/faster-rcnn/tools/demo.py new file mode 100755 index 000000000..7293b4635 --- /dev/null +++ b/examples/faster-rcnn/tools/demo.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +""" +Demo script showing detections in sample images. + +See README.md for installation instructions before running. 
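+
+Example (runs on CPU):
+    ./tools/demo.py --net vgg16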
+""" + +import _init_paths +from fast_rcnn.config import cfg +from fast_rcnn.test import im_detect +from fast_rcnn.nms_wrapper import nms +from utils.timer import Timer +import matplotlib.pyplot as plt +import numpy as np +import scipy.io as sio +import caffe, os, sys, cv2 +import argparse + +CLASSES = ('__background__', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + +NETS = {'vgg16': ('VGG16', + 'VGG16_faster_rcnn_final.caffemodel'), + 'zf': ('ZF', + 'ZF_faster_rcnn_final.caffemodel')} + + +def vis_detections(im, class_name, dets, thresh=0.5): + """Draw detected bounding boxes.""" + inds = np.where(dets[:, -1] >= thresh)[0] + if len(inds) == 0: + return + + im = im[:, :, (2, 1, 0)] + fig, ax = plt.subplots(figsize=(12, 12)) + ax.imshow(im, aspect='equal') + for i in inds: + bbox = dets[i, :4] + score = dets[i, -1] + + ax.add_patch( + plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], fill=False, + edgecolor='red', linewidth=3.5) + ) + ax.text(bbox[0], bbox[1] - 2, + '{:s} {:.3f}'.format(class_name, score), + bbox=dict(facecolor='blue', alpha=0.5), + fontsize=14, color='white') + + ax.set_title(('{} detections with ' + 'p({} | box) >= {:.1f}').format(class_name, class_name, + thresh), + fontsize=14) + plt.axis('off') + plt.tight_layout() + plt.draw() + +def demo(net, image_name): + """Detect object classes in an image using pre-computed object proposals.""" + + # Load the demo image + im_file = os.path.join(cfg.DATA_DIR, 'demo', image_name) + im = cv2.imread(im_file) + + # Detect all object classes and regress object bounds + timer = Timer() + timer.tic() + scores, boxes = im_detect(net, im) + timer.toc() + print ('Detection took {:.3f}s for ' + '{:d} object proposals').format(timer.total_time, boxes.shape[0]) + + # Visualize detections for each class + CONF_THRESH = 0.8 + NMS_THRESH = 0.3 + for cls_ind, cls in enumerate(CLASSES[1:]): + cls_ind += 1 # because we skipped background + cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)] + cls_scores = scores[:, cls_ind] + dets = np.hstack((cls_boxes, + cls_scores[:, np.newaxis])).astype(np.float32) + keep = nms(dets, NMS_THRESH) + dets = dets[keep, :] + vis_detections(im, cls, dets, thresh=CONF_THRESH) + +def parse_args(): + """Parse input arguments.""" + parser = argparse.ArgumentParser(description='Faster R-CNN demo') + parser.add_argument('--net', dest='demo_net', help='Network to use [vgg16]', + choices=NETS.keys(), default='vgg16') + + args = parser.parse_args() + + return args + +if __name__ == '__main__': + cfg.TEST.HAS_RPN = True # Use RPN for proposals + + args = parse_args() + + prototxt = os.path.join(cfg.MODELS_DIR, NETS[args.demo_net][0], + 'faster_rcnn_alt_opt', 'faster_rcnn_test.pt') + caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models', + NETS[args.demo_net][1]) + + if not os.path.isfile(caffemodel): + raise IOError(('{:s} not found.\nDid you run ./data/script/' + 'fetch_faster_rcnn_models.sh?').format(caffemodel)) + + caffe.set_mode_cpu() + net = caffe.Net(prototxt, caffemodel, caffe.TEST) + + print '\n\nLoaded network {:s}'.format(caffemodel) + + # Warmup on a dummy image + im = 128 * np.ones((300, 500, 3), dtype=np.uint8) + for i in xrange(2): + _, _= im_detect(net, im) + + im_names = ['000456.jpg', '000542.jpg', '001150.jpg', + '001763.jpg', '004545.jpg'] + for im_name in im_names: + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 
'Demo for data/demo/{}'.format(im_name) + demo(net, im_name) + + plt.show() diff --git a/examples/faster-rcnn/tools/eval_recall.py b/examples/faster-rcnn/tools/eval_recall.py new file mode 100755 index 000000000..b1a59dc27 --- /dev/null +++ b/examples/faster-rcnn/tools/eval_recall.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +import _init_paths +from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list +from datasets.factory import get_imdb +import argparse +import time, os, sys +import numpy as np + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to test', + default='voc_2007_test', type=str) + parser.add_argument('--method', dest='method', + help='proposal method', + default='selective_search', type=str) + parser.add_argument('--rpn-file', dest='rpn_file', + default=None, type=str) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + + print('Called with args:') + print(args) + + imdb = get_imdb(args.imdb_name) + imdb.set_proposal_method(args.method) + if args.rpn_file is not None: + imdb.config['rpn_file'] = args.rpn_file + + candidate_boxes = None + if 0: + import scipy.io as sio + filename = 'debug/stage1_rpn_voc_2007_test.mat' + raw_data = sio.loadmat(filename)['aboxes'].ravel() + candidate_boxes = raw_data + + ar, gt_overlaps, recalls, thresholds = \ + imdb.evaluate_recall(candidate_boxes=candidate_boxes) + print 'Method: {}'.format(args.method) + print 'AverageRec: {:.3f}'.format(ar) + + def recall_at(t): + ind = np.where(thresholds > t - 1e-5)[0][0] + assert np.isclose(thresholds[ind], t) + return recalls[ind] + + print 'Recall@0.5: {:.3f}'.format(recall_at(0.5)) + print 'Recall@0.6: {:.3f}'.format(recall_at(0.6)) + print 'Recall@0.7: {:.3f}'.format(recall_at(0.7)) + print 'Recall@0.8: {:.3f}'.format(recall_at(0.8)) + print 'Recall@0.9: {:.3f}'.format(recall_at(0.9)) + # print again for easy spreadsheet copying + print '{:.3f}'.format(ar) + print '{:.3f}'.format(recall_at(0.5)) + print '{:.3f}'.format(recall_at(0.6)) + print '{:.3f}'.format(recall_at(0.7)) + print '{:.3f}'.format(recall_at(0.8)) + print '{:.3f}'.format(recall_at(0.9)) diff --git a/examples/faster-rcnn/tools/reval.py b/examples/faster-rcnn/tools/reval.py new file mode 100755 index 000000000..905ec1b14 --- /dev/null +++ b/examples/faster-rcnn/tools/reval.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Reval = re-eval. 
Re-evaluate saved detections.""" + +import _init_paths +from fast_rcnn.test import apply_nms +from fast_rcnn.config import cfg +from datasets.factory import get_imdb +import cPickle +import os, sys, argparse +import numpy as np + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Re-evaluate results') + parser.add_argument('output_dir', nargs=1, help='results directory', + type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to re-evaluate', + default='voc_2007_test', type=str) + parser.add_argument('--matlab', dest='matlab_eval', + help='use matlab for evaluation', + action='store_true') + parser.add_argument('--comp', dest='comp_mode', help='competition mode', + action='store_true') + parser.add_argument('--nms', dest='apply_nms', help='apply nms', + action='store_true') + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +def from_dets(imdb_name, output_dir, args): + imdb = get_imdb(imdb_name) + imdb.competition_mode(args.comp_mode) + imdb.config['matlab_eval'] = args.matlab_eval + with open(os.path.join(output_dir, 'detections.pkl'), 'rb') as f: + dets = cPickle.load(f) + + if args.apply_nms: + print 'Applying NMS to all detections' + nms_dets = apply_nms(dets, cfg.TEST.NMS) + else: + nms_dets = dets + + print 'Evaluating detections' + imdb.evaluate_detections(nms_dets, output_dir) + +if __name__ == '__main__': + args = parse_args() + + output_dir = os.path.abspath(args.output_dir[0]) + imdb_name = args.imdb_name + from_dets(imdb_name, output_dir, args) diff --git a/examples/faster-rcnn/tools/rpn_generate.py b/examples/faster-rcnn/tools/rpn_generate.py new file mode 100755 index 000000000..f8ca4a167 --- /dev/null +++ b/examples/faster-rcnn/tools/rpn_generate.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Fast/er/ R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Generate RPN proposals.""" + +import _init_paths +import numpy as np +from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from datasets.factory import get_imdb +from rpn.generate import imdb_proposals +import cPickle +import caffe +import argparse +import pprint +import time, os, sys + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', + default=0, type=int) + parser.add_argument('--def', dest='prototxt', + help='prototxt file defining the network', + default=None, type=str) + parser.add_argument('--net', dest='caffemodel', + help='model to test', + default=None, type=str) + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', default=None, type=str) + parser.add_argument('--wait', dest='wait', + help='wait until net file exists', + default=True, type=bool) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to test', + default='voc_2007_test', type=str) + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + + print('Called with args:') + print(args) + + if 
args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + cfg.GPU_ID = args.gpu_id + + # RPN test settings + cfg.TEST.RPN_PRE_NMS_TOP_N = -1 + cfg.TEST.RPN_POST_NMS_TOP_N = 2000 + + print('Using config:') + pprint.pprint(cfg) + + while not os.path.exists(args.caffemodel) and args.wait: + print('Waiting for {} to exist...'.format(args.caffemodel)) + time.sleep(10) + + caffe.set_mode_gpu() + caffe.set_device(args.gpu_id) + net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) + net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] + + imdb = get_imdb(args.imdb_name) + imdb_boxes = imdb_proposals(net, imdb) + + output_dir = get_output_dir(imdb, net) + rpn_file = os.path.join(output_dir, net.name + '_rpn_proposals.pkl') + with open(rpn_file, 'wb') as f: + cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL) + print 'Wrote RPN proposals to {}'.format(rpn_file) diff --git a/examples/faster-rcnn/tools/test_net.py b/examples/faster-rcnn/tools/test_net.py new file mode 100755 index 000000000..c22db3e1b --- /dev/null +++ b/examples/faster-rcnn/tools/test_net.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Test a Fast R-CNN network on an image database.""" + +import _init_paths +from fast_rcnn.test import test_net +from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list +from datasets.factory import get_imdb +import caffe +import argparse +import pprint +import time, os, sys + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', + default=0, type=int) + parser.add_argument('--def', dest='prototxt', + help='prototxt file defining the network', + default=None, type=str) + parser.add_argument('--net', dest='caffemodel', + help='model to test', + default=None, type=str) + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', default=None, type=str) + parser.add_argument('--wait', dest='wait', + help='wait until net file exists', + default=True, type=bool) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to test', + default='voc_2007_test', type=str) + parser.add_argument('--comp', dest='comp_mode', help='competition mode', + action='store_true') + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + parser.add_argument('--vis', dest='vis', help='visualize detections', + action='store_true') + parser.add_argument('--num_dets', dest='max_per_image', + help='max number of detections per image', + default=100, type=int) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + cfg.GPU_ID = args.gpu_id + + print('Using config:') + pprint.pprint(cfg) + + while not os.path.exists(args.caffemodel) and args.wait: + print('Waiting for {} to exist...'.format(args.caffemodel)) + time.sleep(10) + + #caffe.set_mode_gpu() + #caffe.set_device(args.gpu_id) + 
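+    # With the GPU calls above commented out, Caffe runs in its default
+    # CPU mode.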
net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) + net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] + + imdb = get_imdb(args.imdb_name) + imdb.competition_mode(args.comp_mode) + if not cfg.TEST.HAS_RPN: + imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD) + + test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis) diff --git a/examples/faster-rcnn/tools/train_faster_rcnn_alt_opt.py b/examples/faster-rcnn/tools/train_faster_rcnn_alt_opt.py new file mode 100755 index 000000000..e49844a45 --- /dev/null +++ b/examples/faster-rcnn/tools/train_faster_rcnn_alt_opt.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Train a Faster R-CNN network using alternating optimization. +This tool implements the alternating optimization algorithm described in our +NIPS 2015 paper ("Faster R-CNN: Towards Real-time Object Detection with Region +Proposal Networks." Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.) +""" + +import _init_paths +from fast_rcnn.train import get_training_roidb, train_net +from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from datasets.factory import get_imdb +from rpn.generate import imdb_proposals +import argparse +import pprint +import numpy as np +import sys, os +import multiprocessing as mp +import cPickle +import shutil + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Faster R-CNN network') + parser.add_argument('--gpu', dest='gpu_id', + help='GPU device id to use [0]', + default=0, type=int) + parser.add_argument('--net_name', dest='net_name', + help='network name (e.g., "ZF")', + default=None, type=str) + parser.add_argument('--weights', dest='pretrained_model', + help='initialize with pretrained model weights', + default=None, type=str) + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', + default=None, type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +def get_roidb(imdb_name, rpn_file=None): + imdb = get_imdb(imdb_name) + print 'Loaded dataset `{:s}` for training'.format(imdb.name) + imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) + print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) + if rpn_file is not None: + imdb.config['rpn_file'] = rpn_file + roidb = get_training_roidb(imdb) + return roidb, imdb + +def get_solvers(net_name): + # Faster R-CNN Alternating Optimization + n = 'faster_rcnn_alt_opt' + # Solver for each training stage + solvers = [[net_name, n, 'stage1_rpn_solver60k80k.pt'], + [net_name, n, 'stage1_fast_rcnn_solver30k40k.pt'], + [net_name, n, 'stage2_rpn_solver60k80k.pt'], + [net_name, n, 'stage2_fast_rcnn_solver30k40k.pt']] + solvers = [os.path.join(cfg.MODELS_DIR, *s) for s in solvers] + # Iterations for each training stage + max_iters = [80000, 40000, 80000, 40000] + # max_iters = [100, 100, 100, 100] + # Test prototxt for the RPN + rpn_test_prototxt = os.path.join( + cfg.MODELS_DIR, net_name, n, 'rpn_test.pt') + return 
solvers, max_iters, rpn_test_prototxt + +# ------------------------------------------------------------------------------ +# Pycaffe doesn't reliably free GPU memory when instantiated nets are discarded +# (e.g. "del net" in Python code). To work around this issue, each training +# stage is executed in a separate process using multiprocessing.Process. +# ------------------------------------------------------------------------------ + +def _init_caffe(cfg): + """Initialize pycaffe in a training process. + """ + + import caffe + # fix the random seeds (numpy and caffe) for reproducibility + np.random.seed(cfg.RNG_SEED) + caffe.set_random_seed(cfg.RNG_SEED) + # set up caffe + caffe.set_mode_gpu() + caffe.set_device(cfg.GPU_ID) + +def train_rpn(queue=None, imdb_name=None, init_model=None, solver=None, + max_iters=None, cfg=None): + """Train a Region Proposal Network in a separate training process. + """ + + # Not using any proposals, just ground-truth boxes + cfg.TRAIN.HAS_RPN = True + cfg.TRAIN.BBOX_REG = False # applies only to Fast R-CNN bbox regression + cfg.TRAIN.PROPOSAL_METHOD = 'gt' + cfg.TRAIN.IMS_PER_BATCH = 1 + print 'Init model: {}'.format(init_model) + print('Using config:') + pprint.pprint(cfg) + + import caffe + _init_caffe(cfg) + + roidb, imdb = get_roidb(imdb_name) + print 'roidb len: {}'.format(len(roidb)) + output_dir = get_output_dir(imdb) + print 'Output will be saved to `{:s}`'.format(output_dir) + + model_paths = train_net(solver, roidb, output_dir, + pretrained_model=init_model, + max_iters=max_iters) + # Cleanup all but the final model + for i in model_paths[:-1]: + os.remove(i) + rpn_model_path = model_paths[-1] + # Send final model path through the multiprocessing queue + queue.put({'model_path': rpn_model_path}) + +def rpn_generate(queue=None, imdb_name=None, rpn_model_path=None, cfg=None, + rpn_test_prototxt=None): + """Use a trained RPN to generate proposals. + """ + + cfg.TEST.RPN_PRE_NMS_TOP_N = -1 # no pre NMS filtering + cfg.TEST.RPN_POST_NMS_TOP_N = 2000 # limit top boxes after NMS + print 'RPN model: {}'.format(rpn_model_path) + print('Using config:') + pprint.pprint(cfg) + + import caffe + _init_caffe(cfg) + + # NOTE: the matlab implementation computes proposals on flipped images, too. + # We compute them on the image once and then flip the already computed + # proposals. This might cause a minor loss in mAP (less proposal jittering). + imdb = get_imdb(imdb_name) + print 'Loaded dataset `{:s}` for proposal generation'.format(imdb.name) + + # Load RPN and configure output directory + rpn_net = caffe.Net(rpn_test_prototxt, rpn_model_path, caffe.TEST) + output_dir = get_output_dir(imdb) + print 'Output will be saved to `{:s}`'.format(output_dir) + # Generate proposals on the imdb + rpn_proposals = imdb_proposals(rpn_net, imdb) + # Write proposals to disk and send the proposal file path through the + # multiprocessing queue + rpn_net_name = os.path.splitext(os.path.basename(rpn_model_path))[0] + rpn_proposals_path = os.path.join( + output_dir, rpn_net_name + '_proposals.pkl') + with open(rpn_proposals_path, 'wb') as f: + cPickle.dump(rpn_proposals, f, cPickle.HIGHEST_PROTOCOL) + print 'Wrote RPN proposals to {}'.format(rpn_proposals_path) + queue.put({'proposal_path': rpn_proposals_path}) + +def train_fast_rcnn(queue=None, imdb_name=None, init_model=None, solver=None, + max_iters=None, cfg=None, rpn_file=None): + """Train a Fast R-CNN using proposals generated by an RPN. 
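+
+    Proposals are read from rpn_file rather than generated on-the-fly:
+    cfg.TRAIN.HAS_RPN is turned off and cfg.TRAIN.PROPOSAL_METHOD is set to
+    'rpn'.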
+ """ + + cfg.TRAIN.HAS_RPN = False # not generating prosals on-the-fly + cfg.TRAIN.PROPOSAL_METHOD = 'rpn' # use pre-computed RPN proposals instead + cfg.TRAIN.IMS_PER_BATCH = 2 + print 'Init model: {}'.format(init_model) + print 'RPN proposals: {}'.format(rpn_file) + print('Using config:') + pprint.pprint(cfg) + + import caffe + _init_caffe(cfg) + + roidb, imdb = get_roidb(imdb_name, rpn_file=rpn_file) + output_dir = get_output_dir(imdb) + print 'Output will be saved to `{:s}`'.format(output_dir) + # Train Fast R-CNN + model_paths = train_net(solver, roidb, output_dir, + pretrained_model=init_model, + max_iters=max_iters) + # Cleanup all but the final model + for i in model_paths[:-1]: + os.remove(i) + fast_rcnn_model_path = model_paths[-1] + # Send Fast R-CNN model path over the multiprocessing queue + queue.put({'model_path': fast_rcnn_model_path}) + +if __name__ == '__main__': + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + cfg.GPU_ID = args.gpu_id + + # -------------------------------------------------------------------------- + # Pycaffe doesn't reliably free GPU memory when instantiated nets are + # discarded (e.g. "del net" in Python code). To work around this issue, each + # training stage is executed in a separate process using + # multiprocessing.Process. + # -------------------------------------------------------------------------- + + # queue for communicated results between processes + mp_queue = mp.Queue() + # solves, iters, etc. for each training stage + solvers, max_iters, rpn_test_prototxt = get_solvers(args.net_name) + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 1 RPN, init from ImageNet model' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + cfg.TRAIN.SNAPSHOT_INFIX = 'stage1' + mp_kwargs = dict( + queue=mp_queue, + imdb_name=args.imdb_name, + init_model=args.pretrained_model, + solver=solvers[0], + max_iters=max_iters[0], + cfg=cfg) + p = mp.Process(target=train_rpn, kwargs=mp_kwargs) + p.start() + rpn_stage1_out = mp_queue.get() + p.join() + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 1 RPN, generate proposals' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + mp_kwargs = dict( + queue=mp_queue, + imdb_name=args.imdb_name, + rpn_model_path=str(rpn_stage1_out['model_path']), + cfg=cfg, + rpn_test_prototxt=rpn_test_prototxt) + p = mp.Process(target=rpn_generate, kwargs=mp_kwargs) + p.start() + rpn_stage1_out['proposal_path'] = mp_queue.get()['proposal_path'] + p.join() + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 1 Fast R-CNN using RPN proposals, init from ImageNet model' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + cfg.TRAIN.SNAPSHOT_INFIX = 'stage1' + mp_kwargs = dict( + queue=mp_queue, + imdb_name=args.imdb_name, + init_model=args.pretrained_model, + solver=solvers[1], + max_iters=max_iters[1], + cfg=cfg, + rpn_file=rpn_stage1_out['proposal_path']) + p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs) + p.start() + fast_rcnn_stage1_out = mp_queue.get() + p.join() + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 2 RPN, init from stage 1 Fast R-CNN model' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + cfg.TRAIN.SNAPSHOT_INFIX = 'stage2' + mp_kwargs = dict( + 
queue=mp_queue, + imdb_name=args.imdb_name, + init_model=str(fast_rcnn_stage1_out['model_path']), + solver=solvers[2], + max_iters=max_iters[2], + cfg=cfg) + p = mp.Process(target=train_rpn, kwargs=mp_kwargs) + p.start() + rpn_stage2_out = mp_queue.get() + p.join() + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 2 RPN, generate proposals' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + mp_kwargs = dict( + queue=mp_queue, + imdb_name=args.imdb_name, + rpn_model_path=str(rpn_stage2_out['model_path']), + cfg=cfg, + rpn_test_prototxt=rpn_test_prototxt) + p = mp.Process(target=rpn_generate, kwargs=mp_kwargs) + p.start() + rpn_stage2_out['proposal_path'] = mp_queue.get()['proposal_path'] + p.join() + + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + print 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model' + print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + + cfg.TRAIN.SNAPSHOT_INFIX = 'stage2' + mp_kwargs = dict( + queue=mp_queue, + imdb_name=args.imdb_name, + init_model=str(rpn_stage2_out['model_path']), + solver=solvers[3], + max_iters=max_iters[3], + cfg=cfg, + rpn_file=rpn_stage2_out['proposal_path']) + p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs) + p.start() + fast_rcnn_stage2_out = mp_queue.get() + p.join() + + # Create final model (just a copy of the last stage) + final_path = os.path.join( + os.path.dirname(fast_rcnn_stage2_out['model_path']), + args.net_name + '_faster_rcnn_final.caffemodel') + print 'cp {} -> {}'.format( + fast_rcnn_stage2_out['model_path'], final_path) + shutil.copy(fast_rcnn_stage2_out['model_path'], final_path) + print 'Final model: {}'.format(final_path) diff --git a/examples/faster-rcnn/tools/train_net.py b/examples/faster-rcnn/tools/train_net.py new file mode 100755 index 000000000..3c6e6fbb6 --- /dev/null +++ b/examples/faster-rcnn/tools/train_net.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +"""Train a Fast R-CNN network on a region of interest database.""" + +import _init_paths +from fast_rcnn.train import get_training_roidb, train_net +from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from datasets.factory import get_imdb +import datasets.imdb +import caffe +import argparse +import pprint +import numpy as np +import sys + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') + parser.add_argument('--gpu', dest='gpu_id', + help='GPU device id to use [0]', + default=0, type=int) + parser.add_argument('--solver', dest='solver', + help='solver prototxt', + default=None, type=str) + parser.add_argument('--iters', dest='max_iters', + help='number of iterations to train', + default=40000, type=int) + parser.add_argument('--weights', dest='pretrained_model', + help='initialize with pretrained model weights', + default=None, type=str) + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', + default=None, type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--rand', dest='randomize', + help='randomize (do not use a fixed seed)', + action='store_true') + 
parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +def combined_roidb(imdb_names): + def get_roidb(imdb_name): + imdb = get_imdb(imdb_name) + print 'Loaded dataset `{:s}` for training'.format(imdb.name) + imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) + print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) + roidb = get_training_roidb(imdb) + return roidb + + roidbs = [get_roidb(s) for s in imdb_names.split('+')] + roidb = roidbs[0] + if len(roidbs) > 1: + for r in roidbs[1:]: + roidb.extend(r) + imdb = datasets.imdb.imdb(imdb_names) + else: + imdb = get_imdb(imdb_names) + return imdb, roidb + +if __name__ == '__main__': + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + cfg.GPU_ID = args.gpu_id + + print('Using config:') + pprint.pprint(cfg) + + if not args.randomize: + # fix the random seeds (numpy and caffe) for reproducibility + np.random.seed(cfg.RNG_SEED) + caffe.set_random_seed(cfg.RNG_SEED) + + # set up caffe + #caffe.set_mode_gpu() + #caffe.set_device(args.gpu_id) + + imdb, roidb = combined_roidb(args.imdb_name) + print '{:d} roidb entries'.format(len(roidb)) + + output_dir = get_output_dir(imdb) + print 'Output will be saved to `{:s}`'.format(output_dir) + + train_net(args.solver, roidb, output_dir, + pretrained_model=args.pretrained_model, + max_iters=args.max_iters) diff --git a/examples/faster-rcnn/tools/train_svms.py b/examples/faster-rcnn/tools/train_svms.py new file mode 100755 index 000000000..498bbf2a2 --- /dev/null +++ b/examples/faster-rcnn/tools/train_svms.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python + +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +""" +Train post-hoc SVMs using the algorithm and hyper-parameters from +traditional R-CNN. +""" + +import _init_paths +from fast_rcnn.config import cfg, cfg_from_file +from datasets.factory import get_imdb +from fast_rcnn.test import im_detect +from utils.timer import Timer +import caffe +import argparse +import pprint +import numpy as np +import numpy.random as npr +import cv2 +from sklearn import svm +import os, sys + +class SVMTrainer(object): + """ + Trains post-hoc detection SVMs for all classes using the algorithm + and hyper-parameters of traditional R-CNN. 
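+
+    Training proceeds in three passes: all SVM weights in the net are
+    zero-initialized, fc7 features are collected for every ground-truth
+    positive, and hard negatives are then mined image-by-image, with a
+    class's SVM retrained whenever enough new negatives accumulate.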
+ """ + + def __init__(self, net, imdb): + self.imdb = imdb + self.net = net + self.layer = 'fc7' + self.hard_thresh = -1.0001 + self.neg_iou_thresh = 0.3 + + dim = net.params['cls_score'][0].data.shape[1] + scale = self._get_feature_scale() + print('Feature dim: {}'.format(dim)) + print('Feature scale: {:.3f}'.format(scale)) + self.trainers = [SVMClassTrainer(cls, dim, feature_scale=scale) + for cls in imdb.classes] + + def _get_feature_scale(self, num_images=100): + TARGET_NORM = 20.0 # Magic value from traditional R-CNN + _t = Timer() + roidb = self.imdb.roidb + total_norm = 0.0 + count = 0.0 + inds = npr.choice(xrange(self.imdb.num_images), size=num_images, + replace=False) + for i_, i in enumerate(inds): + im = cv2.imread(self.imdb.image_path_at(i)) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + _t.tic() + scores, boxes = im_detect(self.net, im, roidb[i]['boxes']) + _t.toc() + feat = self.net.blobs[self.layer].data + total_norm += np.sqrt((feat ** 2).sum(axis=1)).sum() + count += feat.shape[0] + print('{}/{}: avg feature norm: {:.3f}'.format(i_ + 1, num_images, + total_norm / count)) + + return TARGET_NORM * 1.0 / (total_norm / count) + + def _get_pos_counts(self): + counts = np.zeros((len(self.imdb.classes)), dtype=np.int) + roidb = self.imdb.roidb + for i in xrange(len(roidb)): + for j in xrange(1, self.imdb.num_classes): + I = np.where(roidb[i]['gt_classes'] == j)[0] + counts[j] += len(I) + + for j in xrange(1, self.imdb.num_classes): + print('class {:s} has {:d} positives'. + format(self.imdb.classes[j], counts[j])) + + return counts + + def get_pos_examples(self): + counts = self._get_pos_counts() + for i in xrange(len(counts)): + self.trainers[i].alloc_pos(counts[i]) + + _t = Timer() + roidb = self.imdb.roidb + num_images = len(roidb) + # num_images = 100 + for i in xrange(num_images): + im = cv2.imread(self.imdb.image_path_at(i)) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + gt_inds = np.where(roidb[i]['gt_classes'] > 0)[0] + gt_boxes = roidb[i]['boxes'][gt_inds] + _t.tic() + scores, boxes = im_detect(self.net, im, gt_boxes) + _t.toc() + feat = self.net.blobs[self.layer].data + for j in xrange(1, self.imdb.num_classes): + cls_inds = np.where(roidb[i]['gt_classes'][gt_inds] == j)[0] + if len(cls_inds) > 0: + cls_feat = feat[cls_inds, :] + self.trainers[j].append_pos(cls_feat) + + print 'get_pos_examples: {:d}/{:d} {:.3f}s' \ + .format(i + 1, len(roidb), _t.average_time) + + def initialize_net(self): + # Start all SVM parameters at zero + self.net.params['cls_score'][0].data[...] = 0 + self.net.params['cls_score'][1].data[...] = 0 + + # Initialize SVMs in a smart way. Not doing this because its such + # a good initialization that we might not learn something close to + # the SVM solution. 
+# # subtract background weights and biases for the foreground classes +# w_bg = self.net.params['cls_score'][0].data[0, :] +# b_bg = self.net.params['cls_score'][1].data[0] +# self.net.params['cls_score'][0].data[1:, :] -= w_bg +# self.net.params['cls_score'][1].data[1:] -= b_bg +# # set the background weights and biases to 0 (where they shall remain) +# self.net.params['cls_score'][0].data[0, :] = 0 +# self.net.params['cls_score'][1].data[0] = 0 + + def update_net(self, cls_ind, w, b): + self.net.params['cls_score'][0].data[cls_ind, :] = w + self.net.params['cls_score'][1].data[cls_ind] = b + + def train_with_hard_negatives(self): + _t = Timer() + roidb = self.imdb.roidb + num_images = len(roidb) + # num_images = 100 + for i in xrange(num_images): + im = cv2.imread(self.imdb.image_path_at(i)) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + _t.tic() + scores, boxes = im_detect(self.net, im, roidb[i]['boxes']) + _t.toc() + feat = self.net.blobs[self.layer].data + for j in xrange(1, self.imdb.num_classes): + hard_inds = \ + np.where((scores[:, j] > self.hard_thresh) & + (roidb[i]['gt_overlaps'][:, j].toarray().ravel() < + self.neg_iou_thresh))[0] + if len(hard_inds) > 0: + hard_feat = feat[hard_inds, :].copy() + new_w_b = \ + self.trainers[j].append_neg_and_retrain(feat=hard_feat) + if new_w_b is not None: + self.update_net(j, new_w_b[0], new_w_b[1]) + + print(('train_with_hard_negatives: ' + '{:d}/{:d} {:.3f}s').format(i + 1, len(roidb), + _t.average_time)) + + def train(self): + # Initialize SVMs using + # a. w_i = fc8_w_i - fc8_w_0 + # b. b_i = fc8_b_i - fc8_b_0 + # c. Install SVMs into net + self.initialize_net() + + # Pass over roidb to count num positives for each class + # a. Pre-allocate arrays for positive feature vectors + # Pass over roidb, computing features for positives only + self.get_pos_examples() + + # Pass over roidb + # a. Compute cls_score with forward pass + # b. For each class + # i. Select hard negatives + # ii. Add them to cache + # c. For each class + # i. If SVM retrain criteria met, update SVM + # ii. Install new SVM into net + self.train_with_hard_negatives() + + # One final SVM retraining for each class + # Install SVMs into net + for j in xrange(1, self.imdb.num_classes): + new_w_b = self.trainers[j].append_neg_and_retrain(force=True) + self.update_net(j, new_w_b[0], new_w_b[1]) + +class SVMClassTrainer(object): + """Manages post-hoc SVM training for a single object class.""" + + def __init__(self, cls, dim, feature_scale=1.0, + C=0.001, B=10.0, pos_weight=2.0): + self.pos = np.zeros((0, dim), dtype=np.float32) + self.neg = np.zeros((0, dim), dtype=np.float32) + self.B = B + self.C = C + self.cls = cls + self.pos_weight = pos_weight + self.dim = dim + self.feature_scale = feature_scale + self.svm = svm.LinearSVC(C=C, class_weight={1: 2, -1: 1}, + intercept_scaling=B, verbose=1, + penalty='l2', loss='l1', + random_state=cfg.RNG_SEED, dual=True) + self.pos_cur = 0 + self.num_neg_added = 0 + self.retrain_limit = 2000 + self.evict_thresh = -1.1 + self.loss_history = [] + + def alloc_pos(self, count): + self.pos_cur = 0 + self.pos = np.zeros((count, self.dim), dtype=np.float32) + + def append_pos(self, feat): + num = feat.shape[0] + self.pos[self.pos_cur:self.pos_cur + num, :] = feat + self.pos_cur += num + + def train(self): + print('>>> Updating {} detector <<<'.format(self.cls)) + num_pos = self.pos.shape[0] + num_neg = self.neg.shape[0] + print('Cache holds {} pos examples and {} neg examples'. 
+ format(num_pos, num_neg)) + X = np.vstack((self.pos, self.neg)) * self.feature_scale + y = np.hstack((np.ones(num_pos), + -np.ones(num_neg))) + self.svm.fit(X, y) + w = self.svm.coef_ + b = self.svm.intercept_[0] + scores = self.svm.decision_function(X) + pos_scores = scores[:num_pos] + neg_scores = scores[num_pos:] + + pos_loss = (self.C * self.pos_weight * + np.maximum(0, 1 - pos_scores).sum()) + neg_loss = self.C * np.maximum(0, 1 + neg_scores).sum() + reg_loss = 0.5 * np.dot(w.ravel(), w.ravel()) + 0.5 * b ** 2 + tot_loss = pos_loss + neg_loss + reg_loss + self.loss_history.append((tot_loss, pos_loss, neg_loss, reg_loss)) + + for i, losses in enumerate(self.loss_history): + print((' {:d}: obj val: {:.3f} = {:.3f} ' + '(pos) + {:.3f} (neg) + {:.3f} (reg)').format(i, *losses)) + + # Sanity check + scores_ret = ( + X * 1.0 / self.feature_scale).dot(w.T * self.feature_scale) + b + assert np.allclose(scores, scores_ret[:, 0], atol=1e-5), \ + "Scores from returned model don't match decision function" + + return ((w * self.feature_scale, b), pos_scores, neg_scores) + + def append_neg_and_retrain(self, feat=None, force=False): + if feat is not None: + num = feat.shape[0] + self.neg = np.vstack((self.neg, feat)) + self.num_neg_added += num + if self.num_neg_added > self.retrain_limit or force: + self.num_neg_added = 0 + new_w_b, pos_scores, neg_scores = self.train() + # scores = np.dot(self.neg, new_w_b[0].T) + new_w_b[1] + # easy_inds = np.where(neg_scores < self.evict_thresh)[0] + not_easy_inds = np.where(neg_scores >= self.evict_thresh)[0] + if len(not_easy_inds) > 0: + self.neg = self.neg[not_easy_inds, :] + # self.neg = np.delete(self.neg, easy_inds) + print(' Pruning easy negatives') + print(' Cache holds {} pos examples and {} neg examples'. + format(self.pos.shape[0], self.neg.shape[0])) + print(' {} pos support vectors'.format((pos_scores <= 1).sum())) + print(' {} neg support vectors'.format((neg_scores >= -1).sum())) + return new_w_b + else: + return None + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train SVMs (old skool)') + parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', + default=0, type=int) + parser.add_argument('--def', dest='prototxt', + help='prototxt file defining the network', + default=None, type=str) + parser.add_argument('--net', dest='caffemodel', + help='model to test', + default=None, type=str) + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', default=None, type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + args = parser.parse_args() + return args + +if __name__ == '__main__': + # Must turn this off to prevent issues when digging into the net blobs to + # pull out features (tricky!) 
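+    # (With DEDUP_BOXES enabled, im_detect() collapses duplicate proposals
+    # before the forward pass, so the rows of net.blobs['fc7'] would no longer
+    # line up one-to-one with the boxes we pass in.)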
+ cfg.DEDUP_BOXES = 0 + + # Must turn this on because we use the test im_detect() method to harvest + # hard negatives + cfg.TEST.SVM = True + + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + + print('Using config:') + pprint.pprint(cfg) + + # fix the random seed for reproducibility + np.random.seed(cfg.RNG_SEED) + + # set up caffe + caffe.set_mode_gpu() + if args.gpu_id is not None: + caffe.set_device(args.gpu_id) + net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) + net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] + out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svm' + out_dir = os.path.dirname(args.caffemodel) + + imdb = get_imdb(args.imdb_name) + print 'Loaded dataset `{:s}` for training'.format(imdb.name) + + # enhance roidb to contain flipped examples + if cfg.TRAIN.USE_FLIPPED: + print 'Appending horizontally-flipped training examples...' + imdb.append_flipped_images() + print 'done' + + SVMTrainer(net, imdb).train() + + filename = '{}/{}.caffemodel'.format(out_dir, out) + net.save(filename) + print 'Wrote svm model to: {:s}'.format(filename) diff --git a/external/mkl/prepare_mkl.sh b/external/mkl/prepare_mkl.sh index 09284dc41..362858b8b 100755 --- a/external/mkl/prepare_mkl.sh +++ b/external/mkl/prepare_mkl.sh @@ -74,10 +74,10 @@ echo $VERSION_LINE # Return Version Line # MKL DST=`dirname $0` OMP=0 -VERSION_MATCH=20170720 -ARCHIVE_BASENAME=mklml_lnx_2018.0.20170720.tgz +VERSION_MATCH=20170908 +ARCHIVE_BASENAME=mklml_lnx_2018.0.20170908.tgz MKL_CONTENT_DIR=`echo $ARCHIVE_BASENAME | rev | cut -d "." -f 2- | rev` -GITHUB_RELEASE_TAG=1.0.2 +GITHUB_RELEASE_TAG=1.0.3 MKLURL="https://github.com/intel/caffe/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME" # there are diffrent MKL lib to be used for GCC and for ICC diff --git a/include/COPYRIGHT_MPICH b/include/COPYRIGHT_MPICH new file mode 100644 index 000000000..a9216d495 --- /dev/null +++ b/include/COPYRIGHT_MPICH @@ -0,0 +1,39 @@ + + COPYRIGHT + +The following is a notice of limited availability of the code, and disclaimer +which must be included in the prologue of the code and in all source listings +of the code. + +Copyright Notice + + 2002 University of Chicago + +Permission is hereby granted to use, reproduce, prepare derivative works, and +to redistribute to others. This software was authored by: + +Mathematics and Computer Science Division +Argonne National Laboratory, Argonne IL 60439 + +(and) + +Department of Computer Science +University of Illinois at Urbana-Champaign + + + GOVERNMENT LICENSE + +Portions of this material resulted from work developed under a U.S. +Government Contract and are subject to the following license: the Government +is granted for itself and others acting on its behalf a paid-up, nonexclusive, +irrevocable worldwide license in this computer software to reproduce, prepare +derivative works, and perform publicly and display publicly. + + DISCLAIMER + +This computer code material was prepared, in part, as an account of work +sponsored by an agency of the United States Government. Neither the United +States, nor the University of Chicago, nor any of their employees, makes any +warranty express or implied, or assumes any legal liability or responsibility +for the accuracy, completeness, or usefulness of any information, apparatus, +product, or process disclosed, or represents that its use would not infringe +privately owned rights. 
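Aside: the weighted hinge-loss bookkeeping in `SVMClassTrainer.train()` (tools/train_svms.py above) is easy to sanity-check outside Caffe. A minimal NumPy sketch, using illustrative random features and omitting the `feature_scale` factor (`svm_objective` and all data here are hypothetical, not part of the patch):

```python
import numpy as np

def svm_objective(w, b, pos_feat, neg_feat, C=0.001, pos_weight=2.0):
    # Same three terms tallied into loss_history by SVMClassTrainer.train().
    pos_scores = pos_feat.dot(w) + b          # positives, target +1
    neg_scores = neg_feat.dot(w) + b          # negatives, target -1
    pos_loss = C * pos_weight * np.maximum(0, 1 - pos_scores).sum()
    neg_loss = C * np.maximum(0, 1 + neg_scores).sum()
    reg_loss = 0.5 * np.dot(w, w) + 0.5 * b ** 2
    return pos_loss + neg_loss + reg_loss

# Smoke test: dim 4096 matches the fc7 features the script harvests.
rng = np.random.RandomState(0)
w, b = 0.01 * rng.randn(4096), 0.0
print(svm_objective(w, b, rng.randn(5, 4096), rng.randn(20, 4096)))
```

Because the cache grows only by hard negatives, this is the objective that `append_neg_and_retrain()` re-minimizes each time `num_neg_added` crosses `retrain_limit`.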
diff --git a/include/caffe/data_reader.hpp b/include/caffe/data_reader.hpp
index dff77199f..a4daa3cfd 100644
--- a/include/caffe/data_reader.hpp
+++ b/include/caffe/data_reader.hpp
@@ -145,7 +145,7 @@ class DataReader {
   const shared_ptr<QueuePair> queue_pair_;
   shared_ptr<Body> body_;
 
-  static map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
+  map<const string, boost::weak_ptr<DataReader::Body> > bodies_;
 
   DISABLE_COPY_AND_ASSIGN(DataReader);
 };
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index 76d5afeb3..a2b5fb2c5 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -143,7 +143,7 @@ template <typename Dtype> class PositiveUnitballFiller : public Filler<Dtype> {
     caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
     // We expect the filler to not be called very frequently, so we will
     // just use a simple implementation
-    int dim = blob->count() / blob->num();
+    int dim = blob->count() / blob->shape(0);
     CHECK(dim);
     for (int i = 0; i < blob->num(); ++i) {
       Dtype sum = 0;
@@ -180,8 +180,8 @@ template <typename Dtype> class XavierFiller : public Filler<Dtype> {
   explicit XavierFiller(const FillerParameter& param) : Filler<Dtype>(param) {}
   virtual void Fill(Blob<Dtype>* blob) {
     CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
+    int fan_in = blob->count() / blob->shape(0);
+    int fan_out = blob->count() / blob->shape(1);
     Dtype n = fan_in;  // default to fan_in
     if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_AVERAGE) {
@@ -220,8 +220,8 @@ template <typename Dtype> class MSRAFiller : public Filler<Dtype> {
   explicit MSRAFiller(const FillerParameter& param) : Filler<Dtype>(param) {}
   virtual void Fill(Blob<Dtype>* blob) {
     CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
+    int fan_in = blob->count() / blob->shape(0);
+    int fan_out = blob->count() / blob->shape(1);
     Dtype n = fan_in;  // default to fan_in
     if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_AVERAGE) {
diff --git a/include/caffe/layers/box_annotator_ohem_layer.hpp b/include/caffe/layers/box_annotator_ohem_layer.hpp
new file mode 100644
index 000000000..9880ff732
--- /dev/null
+++ b/include/caffe/layers/box_annotator_ohem_layer.hpp
@@ -0,0 +1,96 @@
+/*
+All modification made by Intel Corporation: © 2016 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + + +#ifndef CAFFE_BOX_ANNOTATOR_OHEM_LAYER_HPP_ +#define CAFFE_BOX_ANNOTATOR_OHEM_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/loss_layer.hpp" + +namespace caffe { + + /** + * @brief BoxAnnotatorOHEMLayer: Annotate box labels for Online Hard Example Mining (OHEM) training + * R-FCN + * Written by Yi Li + */ + template + class BoxAnnotatorOHEMLayer :public Layer{ + public: + explicit BoxAnnotatorOHEMLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "BoxAnnotatorOHEM"; } + + virtual inline int ExactNumBottomBlobs() const { return 4; } + virtual inline int ExactNumTopBlobs() const { return 2; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int num_; + int height_; + int width_; + int spatial_dim_; + int bbox_channels_; + + int roi_per_img_; + int ignore_label_; + }; + +} // namespace caffe + +#endif // CAFFE_BOX_ANNOTATOR_OHEM_LAYER_HPP_ diff --git a/include/caffe/layers/mkl_layers.hpp b/include/caffe/layers/mkl_layers.hpp index c9806daee..32d233999 100644 --- a/include/caffe/layers/mkl_layers.hpp +++ b/include/caffe/layers/mkl_layers.hpp @@ -88,6 +88,11 @@ class MKLConvolutionLayer : public ConvolutionLayer { void Reshape(const vector*>& bottom, const vector*>& top); + void CreateFwdPrimitive(); + void CreateBwdDataPrimitive(); + void CreateBwdFilterPrimitive(); + void CreateBwdBiasPrimitive(); + private: /* Fwd step */ shared_ptr > fwd_bottom_data, fwd_top_data, fwd_filter_data, @@ -127,6 +132,25 @@ class MKLConvolutionLayer : public ConvolutionLayer { bool bprop_unpack_called; + // for reshape + bool reshape; + + size_t bdata_sizes[4]; + size_t bdata_strides[4]; + + size_t f_dimension; + size_t fdata_sizes[5]; + size_t fdata_strides[5]; + + size_t bias_sizes[1]; + size_t bias_strides[1]; + + size_t tdata_sizes[4]; + size_t tdata_strides[4]; + + size_t convolutionStrides[2]; + int inputOffset[2]; + PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_prop_); @@ -349,6 +373,10 @@ class MKLPoolingLayer : public Layer { shared_ptr > bwd_top_diff, bwd_bottom_diff; dnnPrimitive_t poolingFwd, poolingBwd; + bool reshape; + size_t dim; + size_t src_sizes[4], src_strides[4]; + size_t dst_sizes[4], dst_strides[4]; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); @@ -409,6 +437,8 @@ class MKLReLULayer : public NeuronLayer { dnnPrimitive_t 
reluFwd_, reluBwd_; vector sizes_; vector strides_; + bool reshape; + size_t dim; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); @@ -484,10 +514,11 @@ class MKLBatchNormLayer : public Layer { scaleShift_buffer_(static_cast(NULL)), diffScaleShift_buffer_(static_cast(NULL)), layout_usr_(static_cast(NULL)), - use_global_stats_(false), num_stats_batches_(1), - stats_batch_size_(0) - { + stats_batch_size_(0) { + blobs_initialized_ = false; + use_global_stats_ = false; + PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); PERFORMANCE_EVENT_ID_RESET(perf_id_bw_); } @@ -546,6 +577,8 @@ class MKLBatchNormLayer : public Layer { Dtype *diffScaleShift_buffer_; dnnLayout_t layout_usr_; bool use_global_stats_; + + bool blobs_initialized_; int num_stats_batches_; int stats_batch_size_; diff --git a/include/caffe/layers/mkldnn_layers.hpp b/include/caffe/layers/mkldnn_layers.hpp index f7ce1062e..14087cd84 100644 --- a/include/caffe/layers/mkldnn_layers.hpp +++ b/include/caffe/layers/mkldnn_layers.hpp @@ -69,7 +69,7 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { , bwd_top_diff(), bwd_bottom_diff() , BatchNormFwd_pd(), BatchNormBwd_pd() , scaleshift_memory(), bwd_scaleshift_diff_memory() - , output_memory(), bwd_bottom_diff_memory() + , output_memory(), bwd_bottom_diff_memory(), inplace_buffer_memory() , input_primitive(), bwd_top_diff_primitive() { PERFORMANCE_EVENT_ID_RESET(perf_id_fw_); @@ -95,10 +95,12 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { void InitBatchNormBwd(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - void InitBatchNormFwdPrimitive(int stats_batch_idx); - void InitBatchNormBwdPrimitive(int stats_batch_idx); + void InitBatchNormFwdPrimitive(int stats_batch_idx, bool inplace); + void InitBatchNormBwdPrimitive(int stats_batch_idx, bool inplace); template shared_ptr GetStatsBatchMemory( shared_ptr > mkldnn_data, int idx); + template shared_ptr GetStatsBatchMemoryInplace( + shared_ptr > mkldnn_data, int idx, shared_ptr buffer_memory); void InitStatsBatchVars(int batch_size); shared_ptr > fwd_top_data, fwd_bottom_data; shared_ptr > bwd_top_diff, bwd_bottom_diff; @@ -110,7 +112,8 @@ class MKLDNNBatchNormLayer : public MKLDNNLayer, public Layer { shared_ptr scaleshift_memory, bwd_scaleshift_diff_memory; shared_ptr output_memory, bwd_bottom_diff_memory; - vector > input_stats, output_stats, top_diff_stats, bottom_diff_stats; + shared_ptr inplace_buffer_memory; + vector > input_stats, output_stats, top_diff_stats, bottom_diff_stats, input_inplace_buffer; shared_ptr input_primitive, bwd_top_diff_primitive; @@ -175,6 +178,11 @@ class MKLDNNConvolutionLayer : public MKLDNNLayer , public ConvolutionLay int pad_w_, pad_h_; mkldnn::algorithm conv_algorithm; + /* In case of (iter_size > 1) we need additional buffers */ + shared_ptr > bwdw_weights_diff_iter, bwdw_bias_diff_iter; + shared_ptr bwdw_weights_diff_memory_iter, bwdw_bias_diff_memory_iter; + shared_ptr > bwdw_weights_diff_iter_blob, bwdw_bias_diff_iter_blob; + PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_weights_); @@ -216,6 +224,11 @@ class MKLDNNInnerProductLayer : public MKLDNNLayer , public InnerProductL , bwdd_top_diff_primitive, bwdd_weights_data_primitive , bwdw_top_diff_primitive, bwdw_bottom_data_primitive; int32_t w_, h_; + + /* In case of (iter_size > 1) we need additional buffers */ + shared_ptr > bwdw_weights_diff_iter, bwdw_bias_diff_iter; + shared_ptr 
bwdw_weights_diff_memory_iter, bwdw_bias_diff_memory_iter; + shared_ptr > bwdw_weights_diff_iter_blob, bwdw_bias_diff_iter_blob; PERFORMANCE_EVENT_ID_DECL(perf_id_fw_); PERFORMANCE_EVENT_ID_DECL(perf_id_bw_); diff --git a/include/caffe/layers/pooling_layer.hpp b/include/caffe/layers/pooling_layer.hpp index a3173cc8e..0f1e42539 100644 --- a/include/caffe/layers/pooling_layer.hpp +++ b/include/caffe/layers/pooling_layer.hpp @@ -90,6 +90,23 @@ class PoolingLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + // @brief The spatial dimensions of a filter kernel. + Blob kernel_shape_; + // @brief The spatial dimensions of the stride. + Blob stride_; + // @brief The spatial dimensions of the padding. + Blob pad_; + // @brief The spatial dimensions of the input. + Blob input_shape_; + // @brief The spatial dimensions of the output. + Blob output_shape_; + + int num_spatial_axes_; + int bottom_dim_; + int top_dim_; + + int channel_axis_; + int num_; int kernel_h_, kernel_w_; int stride_h_, stride_w_; int pad_h_, pad_w_; diff --git a/include/caffe/layers/psroi_pooling_layer.hpp b/include/caffe/layers/psroi_pooling_layer.hpp new file mode 100644 index 000000000..adaa8bf47 --- /dev/null +++ b/include/caffe/layers/psroi_pooling_layer.hpp @@ -0,0 +1,115 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + + +#ifndef CAFFE_PSROI_POOLING_LAYER_HPP_ +#define CAFFE_PSROI_POOLING_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Perform position-sensitive max pooling on regions of interest specified by input, takes + * as input N position-sensitive score maps and a list of R regions of interest. + * ROIPoolingLayer takes 2 inputs and produces 1 output. bottom[0] is + * [N x (C x K^2) x H x W] position-sensitive score maps on which pooling is performed. bottom[1] is + * [R x 5] containing a list R ROI tuples with batch index and coordinates of + * regions of interest. Each row in bottom[1] is a ROI tuple in format + * [batch_index x1 y1 x2 y2], where batch_index corresponds to the index of + * instance in the first input and x1 y1 x2 y2 are 0-indexed coordinates + * of ROI rectangle (including its boundaries). The output top[0] is [R x C x K x K] score maps pooled + * within the ROI tuples. + * @param param provides PSROIPoolingParameter psroi_pooling_param, + * with PSROIPoolingLayer options: + * - output_dim. The pooled output channel number. + * - group_size. The number of groups to encode position-sensitive score maps + * - spatial_scale. Multiplicative spatial scale factor to translate ROI + * coordinates from their input scale to the scale used when pooling. + * R-FCN + * Written by Yi Li + */ + +template +class PSROIPoolingLayer : public Layer { + public: + explicit PSROIPoolingLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "PSROIPooling"; } + + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 2; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + Dtype spatial_scale_; + int output_dim_; + int group_size_; + + int channels_; + int height_; + int width_; + + int pooled_height_; + int pooled_width_; + Blob mapping_channel_; +}; + +} // namespace caffe + +#endif // CAFFE_PSROI_POOLING_LAYER_HPP_ diff --git a/include/caffe/layers/roi_pooling_layer.hpp b/include/caffe/layers/roi_pooling_layer.hpp new file mode 100644 index 000000000..4cba86099 --- /dev/null +++ b/include/caffe/layers/roi_pooling_layer.hpp @@ -0,0 +1,125 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. 
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifndef CAFFE_ROI_POOLING_LAYER_HPP_ +#define CAFFE_ROI_POOLING_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +namespace caffe { + +/** + * @brief Perform max pooling on regions of interest specified by input, takes + * as input N feature maps and a list of R regions of interest. + * + * ROIPoolingLayer takes 2 inputs and produces 1 output. bottom[0] is + * [N x C x H x W] feature maps on which pooling is performed. bottom[1] is + * [R x 5] containing a list R ROI tuples with batch index and coordinates of + * regions of interest. Each row in bottom[1] is a ROI tuple in format + * [batch_index x1 y1 x2 y2], where batch_index corresponds to the index of + * instance in the first input and x1 y1 x2 y2 are 0-indexed coordinates + * of ROI rectangle (including its boundaries). + * + * For each of the R ROIs, max-pooling is performed over pooled_h x pooled_w + * output bins (specified in roi_pooling_param). The pooling bin sizes are + * adaptively set such that they tile ROI rectangle in the indexed feature + * map. The pooling region of vertical bin ph in [0, pooled_h) is computed as + * + * start_ph (included) = y1 + floor(ph * (y2 - y1 + 1) / pooled_h) + * end_ph (excluded) = y1 + ceil((ph + 1) * (y2 - y1 + 1) / pooled_h) + * + * and similar horizontal bins. + * + * @param param provides ROIPoolingParameter roi_pooling_param, + * with ROIPoolingLayer options: + * - pooled_h. The pooled output height. + * - pooled_w. The pooled output width + * - spatial_scale. Multiplicative spatial scale factor to translate ROI + * coordinates from their input scale to the scale used when pooling. 
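+ *
+ * For example, with y1 = 0, y2 = 9 (a 10-row ROI) and pooled_h = 4 the
+ * vertical bins cover rows [0,3), [2,5), [5,8) and [7,10); adjacent bins
+ * may overlap by one row when the ROI height is not a multiple of
+ * pooled_h.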
+ * + * Fast R-CNN + * Written by Ross Girshick + */ + +template +class ROIPoolingLayer : public Layer { + public: + explicit ROIPoolingLayer(const LayerParameter& param) + : Layer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "ROIPooling"; } + + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 2; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 1; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + int channels_; + int num_spatial_axes_; + int depth_; + int height_; + int width_; + int pooled_d_; + int pooled_h_; + int pooled_w_; + Dtype spatial_scale_; + Blob max_idx_; +}; + +} // namespace caffe + +#endif // CAFFE_ROI_POOLING_LAYER_HPP_ diff --git a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp index 02eecc14c..c7247b385 100644 --- a/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp +++ b/include/caffe/layers/sigmoid_cross_entropy_loss_layer.hpp @@ -92,6 +92,10 @@ class SigmoidCrossEntropyLossLayer : public LossLayer { virtual inline const char* type() const { return "SigmoidCrossEntropyLoss"; } + virtual inline int ExactNumBottomBlobs() const { return -1; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + protected: /// @copydoc SigmoidCrossEntropyLossLayer virtual void Forward_cpu(const vector*>& bottom, diff --git a/include/caffe/layers/smooth_L1_loss_layer.hpp b/include/caffe/layers/smooth_L1_loss_layer.hpp index 28bd55928..37f488415 100644 --- a/include/caffe/layers/smooth_L1_loss_layer.hpp +++ b/include/caffe/layers/smooth_L1_loss_layer.hpp @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "caffe/blob.hpp" +#include "caffe/common.hpp" #include "caffe/layer.hpp" #include "caffe/proto/caffe.pb.h" @@ -72,8 +73,9 @@ class SmoothL1LossLayer : public LossLayer { virtual inline const char* type() const { return "SmoothL1Loss"; } + virtual inline int ExactNumBottomBlobs() const { return -1; } virtual inline int MinBottomBlobs() const { return 2; } - virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int MaxBottomBlobs() const { return 4; } /** * Unlike most loss layers, in the SmoothL1LossLayer we can backpropagate @@ -97,7 +99,9 @@ class SmoothL1LossLayer : public LossLayer { Blob diff_; Blob errors_; + Blob ones_; bool has_weights_; + Dtype sigma2_; }; } // namespace caffe diff --git a/include/caffe/layers/smooth_L1_loss_ohem_layer.hpp b/include/caffe/layers/smooth_L1_loss_ohem_layer.hpp new file mode 100644 index 000000000..45ca8953f --- /dev/null +++ b/include/caffe/layers/smooth_L1_loss_ohem_layer.hpp @@ -0,0 +1,116 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. 
+ +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + + +#ifndef CAFFE_SMOOTH_L1_LOSS_OHEM_LAYER_HPP_ +#define CAFFE_SMOOTH_L1_LOSS_OHEM_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/loss_layer.hpp" + +namespace caffe { + +/** + * @brief SmoothL1LossOHEMLayer + * + * R-FCN + * Written by Yi Li + */ + template + class SmoothL1LossOHEMLayer : public LossLayer { + public: + explicit SmoothL1LossOHEMLayer(const LayerParameter& param) + : LossLayer(param), diff_() {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "SmoothL1LossOHEM"; } + + virtual inline int ExactNumBottomBlobs() const { return -1; } + virtual inline int MinBottomBlobs() const { return 2; } + virtual inline int MaxBottomBlobs() const { return 3; } + virtual inline int ExactNumTopBlobs() const { return -1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 2; } + + /** + * Unlike most loss layers, in the SmoothL1LossOHEMLayer we can backpropagate + * to both inputs -- override to return true and always allow force_backward. + */ + virtual inline bool AllowForceBackward(const int bottom_index) const { + return true; + } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. 
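+  /// (In BVLC Caffe these modes resolve to: FULL = outer_num_ * inner_num_,
+  /// BATCH_SIZE = outer_num_, NONE = 1; the pre_fixed_normalizer argument
+  /// presumably lets the caller supply a fixed denominator instead.)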
+ virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, + Dtype pre_fixed_normalizer); + + Blob diff_; + Blob errors_; + bool has_weights_; + + int outer_num_, inner_num_; + + /// How to normalize the output loss. + LossParameter_NormalizationMode normalization_; + }; + +} // namespace caffe + +#endif // CAFFE_SMOOTH_L1_LOSS_OHEM_LAYER_HPP_ diff --git a/include/caffe/layers/softmax_loss_ohem_layer.hpp b/include/caffe/layers/softmax_loss_ohem_layer.hpp new file mode 100644 index 000000000..0a7f6872c --- /dev/null +++ b/include/caffe/layers/softmax_loss_ohem_layer.hpp @@ -0,0 +1,171 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + + +#ifndef CAFFE_SOFTMAX_WITH_LOSS_OHEM_LAYER_HPP_ +#define CAFFE_SOFTMAX_WITH_LOSS_OHEM_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/loss_layer.hpp" +#include "caffe/layers/softmax_layer.hpp" + +namespace caffe { + +/** + * @brief Computes the multinomial logistic loss for a one-of-many + * classification task, passing real-valued predictions through a + * softmax to get a probability distribution over classes. + * An additional per-instance loss is produced in output for OHEM + * + * This layer should be preferred over separate + * SoftmaxLayer + MultinomialLogisticLossLayer + * as its gradient computation is more numerically stable. + * At test time, this layer can be replaced simply by a SoftmaxLayer. 
+ * + * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. This layer maps these scores to a + * probability distribution over classes using the softmax function + * @f$ \hat{p}_{nk} = \exp(x_{nk}) / + * \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer). + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels @f$ l @f$, an integer-valued Blob with values + * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ + * indicating the correct class label among the @f$ K @f$ classes + * @param top output Blob vector (length 1) + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * the computed cross-entropy classification loss: @f$ E = + * \frac{-1}{N} \sum\limits_{n=1}^N \log(\hat{p}_{n,l_n}) + * @f$, for softmax output class probabilites @f$ \hat{p} @f$ + * @f$, per-instance cross-entropy classification loss + */ +template +class SoftmaxWithLossOHEMLayer : public LossLayer { + public: + /** + * @param param provides LossParameter loss_param, with options: + * - ignore_label (optional) + * Specify a label value that should be ignored when computing the loss. + * - normalize (optional, default true) + * If true, the loss is normalized by the number of (nonignored) labels + * present; otherwise the loss is simply summed over spatial locations. + */ + explicit SoftmaxWithLossOHEMLayer(const LayerParameter& param) + : LossLayer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "SoftmaxWithLossOHEM"; } + virtual inline int ExactNumTopBlobs() const { return -1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 3; } + + protected: + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + /** + * @brief Computes the softmax loss error gradient w.r.t. the predictions. + * + * Gradients cannot be computed with respect to the label inputs (bottom[1]), + * so this method ignores bottom[1] and requires !propagate_down[1], crashing + * if propagate_down[1] is set. + * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (1 \times 1 \times 1 \times 1) @f$ + * This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$, + * as @f$ \lambda @f$ is the coefficient of this layer's output + * @f$\ell_i@f$ in the overall Net loss + * @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence + * @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$. + * (*Assuming that this top Blob is not used as a bottom (input) by any + * other layer of the Net.) + * @param propagate_down see Layer::Backward. + * propagate_down[1] must be false as we can't compute gradients with + * respect to the labels. 
+ * @param bottom input Blob vector (length 2) + * -# @f$ (N \times C \times H \times W) @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ + * -# @f$ (N \times 1 \times 1 \times 1) @f$ + * the labels -- ignored as we can't compute their error gradients + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. If normalization_mode is VALID, the count of valid + /// outputs will be read from valid_count, unless it is -1 in which case + /// all outputs are assumed to be valid. + virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// How to normalize the output loss. + LossParameter_NormalizationMode normalization_; + + int softmax_axis_, outer_num_, inner_num_; +}; + +} // namespace caffe + +#endif // CAFFE_SOFTMAX_WITH_LOSS_OHEM_LAYER_HPP_ diff --git a/include/caffe/layers/spatial_dropout_layer.hpp b/include/caffe/layers/spatial_dropout_layer.hpp new file mode 100644 index 000000000..2eb947108 --- /dev/null +++ b/include/caffe/layers/spatial_dropout_layer.hpp @@ -0,0 +1,118 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CAFFE_SPATIALDROPOUT_LAYER_HPP_ +#define CAFFE_SPATIALDROPOUT_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" + +namespace caffe { + +/** + * @brief During training only, sets a random portion of @f$x@f$ to 0, adjusting + * the rest of the vector magnitude accordingly. + * + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ y = |x| @f$ + */ +template +class SpatialDropoutLayer : public NeuronLayer { + public: + /** + * @param param provides SpatialDropoutParameter spatial_dropout_param, + * with SpatialDropoutLayer options: + * - dropout_ratio (\b optional, default 0.5). + * Sets the probability @f$ p @f$ that any given unit is dropped. + */ + explicit SpatialDropoutLayer(const LayerParameter& param) + : NeuronLayer(param) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "SpatialDropout"; } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs. At training time, we have @f$ + * y_{\mbox{train}} = \left\{ + * \begin{array}{ll} + * \frac{x}{1 - p} & \mbox{if } u > p \\ + * 0 & \mbox{otherwise} + * \end{array} \right. + * @f$, where @f$ u \sim U(0, 1)@f$ is generated independently for each + * input at each iteration. At test time, we simply have + * @f$ y_{\mbox{test}} = \mathbb{E}[y_{\mbox{train}}] = x @f$. 
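+ * For example, with @f$ p = 0.5 @f$ every surviving input is scaled by
+ * @f$ 1/(1-p) = 2 @f$ during training, which keeps the expected output
+ * equal to @f$ x @f$.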
+ */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// when divided by UINT_MAX, the randomly generated values @f$u\sim U(0,1)@f$ + Blob rand_vec_; + /// the probability @f$ p @f$ of dropping any input + Dtype threshold_; + /// the scale for undropped inputs at train time @f$ 1 / (1 - p) @f$ + Dtype scale_; + unsigned int uint_thres_; + bool scale_train_; +}; + +} // namespace caffe + +#endif // CAFFE_SPATIALDROPOUT_LAYER_HPP_ diff --git a/include/caffe/mkldnn_memory.hpp b/include/caffe/mkldnn_memory.hpp index 3b1a1c6ad..597ab95df 100644 --- a/include/caffe/mkldnn_memory.hpp +++ b/include/caffe/mkldnn_memory.hpp @@ -204,7 +204,7 @@ class MKLDNNMemoryDescriptor : public MKLDNNMemoryDescriptorBase { shared_ptr create_output_memory(bool inplace = false); Dtype* get_memory_ptr(long offset = 0); shared_ptr get_memory_desc(); - + size_t get_memory_count(); void set_mkldnn_primitive(MKLDNNPrimitive& mprimitive) { CHECK(mprimitive.aprimitive); _mkldnn_primitive = mprimitive; } MKLDNNPrimitive& mkldnn_primitive() { return _mkldnn_primitive; } shared_ptr aprimitive() const { return _mkldnn_primitive.aprimitive; } diff --git a/include/caffe/multinode/async_param_server.hpp b/include/caffe/multinode/async_param_server.hpp new file mode 100644 index 000000000..41ff1442e --- /dev/null +++ b/include/caffe/multinode/async_param_server.hpp @@ -0,0 +1,146 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// Initial implementation from Jian Zhang and Ioannis Mitliagkas, Stanford, on Oct 2 2016 +// Refer to the paper: https://arxiv.org/pdf/1708.05256.pdf + +#ifndef ASYNC_PARAM_SERVER_H +#define ASYNC_PARAM_SERVER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "caffe/caffe.hpp" +#include "caffe/multinode/multi_solver.hpp" + +namespace caffe { + namespace mn { + + using std::make_pair; + + // TODO modify decoding strategy + // we use TAG = param_id * 10 + part_id + 1973 to identify parts location + struct TaskRequest { + int part_root_rank_; + int layer_id_; + int param_id_; + int part_id_; + int num_parts_; + MPI_Request mpi_request_; + + TaskRequest(): part_root_rank_(0), layer_id_(0), param_id_(0), part_id_(0), + num_parts_(1), mpi_request_() {} + TaskRequest(int root_rank, int layer_id, int param_id, int part_id, int num_parts) : + part_root_rank_(root_rank), layer_id_(layer_id), param_id_(param_id), part_id_(part_id), + num_parts_(num_parts) {} + + int GetTag() { + return param_id_ * 10 + part_id_ + 1973; + } + }; + + // protocol: + // when get a non-blocking mpi receive, comm thread submit a job to the + // update_tasks_ queue. + // The compute thread will check the update_tasks_ queue. After it finishes + // update, the compute thread will submit request to send_tasks_ queue. + // In the communicate loop, the thead consider send task first, and then + // process receive tasks. + template + class AsyncParamServer { + public: + AsyncParamServer(boost::shared_ptr > solver); + ~AsyncParamServer(); + // in the update task, the compute thread + // 0. lock the mutex on blob + // 1. copy buffer to solvers diff buffer + // 2. perform updates + // 3. copy the model to the corresponding mpi buffer + // 4. submit a send task + // 5. unlock the mutex blob + void ProcessUpdateTask(); + // in the Send task, we use non-blocking send for model parts going back to roots + // We do not need to care about the request. Because if the blocking recv + // has not finished on root, it will not start a new send task + void ProcessSendTask(); + // We iterate over the recv_tasks_ vector, when the request is done, we start a + // new corresponding MPI non-blocking recv call. 
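+      // Requests are matched by TaskRequest::GetTag(); e.g. the part with
+      // param_id 3 and part_id 2 travels under tag 3 * 10 + 2 + 1973 = 2005.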
+ void ProcessRecvTask(); + void ComputeLoop(); + void CommLoop(); + void Run(); + + private: + // for communication + std::deque update_tasks_; + std::deque send_tasks_; + boost::mutex update_queue_mutex_; + boost::mutex send_queue_mutex_; + int recv_tasks_iter_; + std::vector recv_tasks_; + // part_root_rank, param_id + std::map, int> rank_layer_blob_to_vec_pos; + // root_rank, param_id + std::map, std::pair > recv_buf_; + std::map, std::pair > send_buf_; + + // for computation + boost::shared_ptr > solver_; + + // for termination: count the number of operations + // needed in total + int64_t send_cnt_; + int64_t update_cnt_; + int64_t total_send_; + int64_t total_update_; + + // iter for different blobs + // param_id, part_id + std::map, int64_t> async_iter_; + + }; + + } // end of namespace async_param_server + +} // end of namespace caffe + +#endif /*ASYNC_PARAM_SERVER_H*/ diff --git a/include/caffe/multinode/mlsl.hpp b/include/caffe/multinode/mlsl.hpp index b0d3d13d6..336dc6028 100644 --- a/include/caffe/multinode/mlsl.hpp +++ b/include/caffe/multinode/mlsl.hpp @@ -41,6 +41,7 @@ #ifdef USE_MLSL #include +#include #include "caffe/common.hpp" namespace caffe { @@ -66,19 +67,73 @@ namespace caffe { return MLSL::Environment::GetEnv().GetProcessCount(); } - inline int get_group_id(int data_parts, int model_parts) { - int node_id = get_node_id(); - int num_nodes = get_nodes_count(); - return (node_id % (num_nodes / data_parts)) / model_parts; + inline bool is_root() { + return mn::get_node_id() == 0; + } + + template + MPI_Datatype DtypeToMPIDtype(); + + template + MLSL::DataType DtypeToMLSLDtype(); + + extern int nGroup; + extern int nServer; + + inline bool use_param_server() { + return nServer > 0; + } + + inline int get_world_size() { + int size = 0; + MPI_Comm_size(MPI_COMM_WORLD, &size); + return size; + } + + inline int get_node_rank() { + int rank = -1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + return rank; } inline bool is_multinode() { - static bool multinode{ get_nodes_count() > 1 }; + static bool multinode{ get_world_size() > 1 }; return multinode; } - inline bool is_root() { - return mn::get_node_id() == 0; + inline int get_num_groups() { + return nGroup; + } + + inline int get_group_size() { + return (get_world_size() - nServer) / nGroup; + } + + inline int get_global_part_id(int data_parts, int model_parts) { + int node_id = get_node_id(); + int num_nodes = get_group_size(); + return (node_id % (num_nodes / data_parts)) / model_parts; + } + + inline int get_group_id() { + return get_node_rank() / get_group_size(); + } + + inline int world_rank_to_root_rank(int world_rank) { + int group_size = get_group_size(); + return world_rank / group_size * group_size; + } + + inline int get_group_root_rank(int group_id, int part_id = 0, int num_parts = 1) { + return group_id * get_group_size() + part_id; + } + + inline int param_to_server_rank(int layer_id, int param_id) { + return (param_id % nServer) + nGroup * get_group_size(); + } + + inline bool is_param_server() { + return get_node_rank() >= nGroup * get_group_size(); } namespace detail { @@ -128,12 +183,22 @@ namespace caffe { MLSL::CommReq *rqts = distrib_->Reduce((void *)sendBuffer, (void*)recvBuffer, count, detail::dtype(), Rtype, rootIdx, Gtype); MLSL::Environment::GetEnv().Wait(rqts); } + template + MLSL::CommReq* reduce_async(Dtype *sendBuffer, Dtype *recvBuffer, size_t count, size_t rootIdx = 0) { + if (skip_comm(Gtype)) return NULL; + return distrib_->Reduce((void *)sendBuffer, (void*)recvBuffer, count, detail::dtype(), 
diff --git a/include/caffe/multinode/multi_solver.hpp b/include/caffe/multinode/multi_solver.hpp
index 5d2082821..771aa52df 100644
--- a/include/caffe/multinode/multi_solver.hpp
+++ b/include/caffe/multinode/multi_solver.hpp
@@ -74,8 +74,7 @@ class MultiSolver {
  protected:
   virtual ~Callback() {
   }
-  virtual void on_iter_finished(int layer_id) = 0;
-
+  virtual void on_backward_finished(int layer_id) = 0;
   virtual void on_delwt_wait(int layer_id) = 0;
 
   virtual void apply_updates(int layer_id) = 0;
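The rename makes the contract explicit: the hook fires per layer as soon as backward has produced that layer's gradients, and `on_delwt_wait()` later blocks until the refreshed weights are in place before they are used again. A hedged, non-templated sketch of how a solver might drive these callbacks; the driving loop here is illustrative only, not lifted from multi_solver.cpp:

```
#include <cstdio>

struct Callback {                     // simplified stand-in for MultiSolver<Dtype>::Callback
  virtual ~Callback() {}
  virtual void on_backward_finished(int layer_id) = 0;  // kick off async comm
  virtual void on_delwt_wait(int layer_id) = 0;         // block until comm done
  virtual void apply_updates(int layer_id) = 0;
};

struct LoggingCallback : Callback {
  void on_backward_finished(int l) override { std::printf("launch comm, layer %d\n", l); }
  void on_delwt_wait(int l) override { std::printf("wait on comm, layer %d\n", l); }
  void apply_updates(int l) override { std::printf("apply update, layer %d\n", l); }
};

int main() {
  LoggingCallback cb;
  const int layers = 3;
  for (int l = layers - 1; l >= 0; --l)   // backward pass, last layer first
    cb.on_backward_finished(l);
  for (int l = 0; l < layers; ++l) {      // before the next forward pass
    cb.on_delwt_wait(l);
    cb.apply_updates(l);
  }
}
```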
diff --git a/include/caffe/multinode/multi_sync.hpp b/include/caffe/multinode/multi_sync.hpp
index 905d9fce7..61a79f5a2 100644
--- a/include/caffe/multinode/multi_sync.hpp
+++ b/include/caffe/multinode/multi_sync.hpp
@@ -56,16 +56,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <string>
 #include <vector>
 #include <map>
+#include <list>
 
 #include "caffe/caffe.hpp"
 #include "caffe/multinode/mlsl.hpp"
 #include "caffe/multinode/multi_solver.hpp"
+#include "caffe/multinode/async_param_server.hpp"
 
 namespace caffe {
 
 #define CAN_USE_PRV_DATA(param) (param->prv_data() && (param->prv_data_count() == param->count()))
 #define CAN_USE_PRV_DIFF(param) (param->prv_diff() && (param->prv_diff_count() == param->count()))
 
+  struct AsyncTask {
+    int layer_id;
+    int param_id;
+    MLSL::CommReq* req;
+    AsyncTask() : layer_id(-1), param_id(-1), req() {}
+    AsyncTask(int layer_id, int param_id, MLSL::CommReq* req) :
+      layer_id(layer_id), param_id(param_id), req(req) {}
+  };
+
   template <typename Dtype>
   class MultiSync : public MultiSolver<Dtype>::Callback {
 
@@ -83,6 +94,16 @@ namespace caffe {
     // statistics
     vector>>> cached_stats;
 
+    // used only if use_param_server() == true
+    vector<MLSL::CommReq*> reduce_req_vec;
+    std::list<AsyncTask> reduce_req_list;
+    vector<MPI_Request> irecv_req_vec;
+    vector<MLSL::CommReq*> broadcast_req_vec;
+    vector<bool> irecv_done;
+    vector<bool> broadcast_launched;
+    std::list<mn::TaskRequest> irecv_req_list;
+    boost::shared_ptr<mn::Distribution> distrib_bcast;
+
 #ifdef PERFORMANCE_MONITORING
 
 #define STATS_OUTPUT_FILE "mlsl_stats.txt"
@@ -214,7 +235,7 @@
       DLOG(INFO) << "started iteration " << solver->root_solver()->iter();
     }
 
-    void on_iter_finished(int layer_id) {
+    void launch_allreduce(int layer_id) {
       boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
       if (layer->layerOp == nullptr) {
         return;
       }
@@ -230,14 +251,209 @@ for (int i = 0; i < param_ids.size(); ++i) {
         if (!layer->ParamNeedReduce(i)) continue;
         if (CAN_USE_PRV_DIFF(net_params[param_ids[i]])) {
-          layer->layerOp->GetParameterSet(i)->StartGradientComm((void *) net_params[param_ids[i]]->mutable_prv_diff());
+          layer->layerOp->GetParameterSet(i)->StartGradientComm(
+            (void *) net_params[param_ids[i]]->mutable_prv_diff());
         } else {
-          layer->layerOp->GetParameterSet(i)->StartGradientComm((void *) net_params[param_ids[i]]->mutable_cpu_diff());
+          layer->layerOp->GetParameterSet(i)->StartGradientComm(
+            (void *) net_params[param_ids[i]]->mutable_cpu_diff());
         }
       }
     }
 
-    void on_delwt_wait(int layer_id) {
+    void launch_reduce(int layer_id, int param_id) {
+      mn::Distribution& distrib = layers[layer_id]->GetDistribution();
+      Dtype* send_buff = NULL;
+      Dtype* recv_buff = NULL;
+      size_t buf_size = net_params[param_id]->count();
+      if (CAN_USE_PRV_DIFF(net_params[param_id])) {
+        send_buff = (Dtype*)net_params[param_id]->prv_diff();
+        recv_buff = net_params[param_id]->mutable_prv_diff();
+      }
+      else {
+        send_buff = (Dtype*)net_params[param_id]->cpu_diff();
+        recv_buff = net_params[param_id]->mutable_cpu_diff();
+      }
+      reduce_req_vec[param_id] =
+        distrib.reduce_async<Dtype, MLSL::ReductionType::RT_SUM, MLSL::GroupType::GT_DATA>(
+          send_buff, recv_buff, buf_size);
+      if (reduce_req_vec[param_id] != NULL && distrib.is_root(MLSL::GroupType::GT_DATA)) {
+        AsyncTask req_task(layer_id, param_id, NULL);
+        reduce_req_list.push_back(req_task);
+      }
+    }
+
+    void check_and_launch_comm_to_ps() {
+      std::list<AsyncTask>::iterator iter = reduce_req_list.begin();
+      int mpi_rank = mn::get_node_rank();
+      while (iter != reduce_req_list.end()) {
+        bool complete = false;
+        if (reduce_req_vec[iter->param_id] == NULL)
+          complete = true;
+        else {
+          MLSL::Environment::GetEnv().Test(reduce_req_vec[iter->param_id], &complete);
+        }
+        if (complete) {
+          // reset the request to indicate there is no need to Wait on it
+          reduce_req_vec[iter->param_id] = NULL;
+
+          void* send_buff;
+          void* recv_buff;
+          int param_id = iter->param_id;
+          size_t buf_size = net_params[param_id]->count();
+
+          if (CAN_USE_PRV_DIFF(net_params[param_id])) {
+            send_buff = (void*)net_params[param_id]->prv_diff();
+          }
+          else {
+            send_buff = (void*)net_params[param_id]->cpu_diff();
+          }
+          if (CAN_USE_PRV_DATA(net_params[param_id])) {
+            recv_buff = (void*)net_params[param_id]->mutable_prv_data();
+          }
+          else {
+            recv_buff = (void*)net_params[param_id]->mutable_cpu_data();
+          }
+          mn::Distribution &distrib = layers[iter->layer_id]->GetDistribution();
+          int server_mpi_rank = mn::param_to_server_rank(iter->layer_id, iter->param_id);
+          mn::TaskRequest task(
+            mpi_rank, iter->layer_id, iter->param_id,
+            distrib.get_node_id(MLSL::GroupType::GT_MODEL),
+            distrib.get_nodes_count(MLSL::GroupType::GT_MODEL));
+          int tag = task.GetTag();
+          MPI_Request send_req;
+          int recv_flag = 1;
+          // recv from the PS
+          MPI_Irecv(recv_buff, buf_size, mn::DtypeToMPIDtype<Dtype>(),
+                    server_mpi_rank, tag, MPI_COMM_WORLD, &irecv_req_vec[param_id]);
+          MPI_Test(&irecv_req_vec[param_id], &recv_flag, MPI_STATUS_IGNORE);
+          CHECK(!recv_flag);
+          // send to the PS
+          MPI_Isend(send_buff, buf_size, mn::DtypeToMPIDtype<Dtype>(),
+                    server_mpi_rank, tag, MPI_COMM_WORLD, &send_req);
+          // TODO: why do we have to wait here?
+          MPI_Wait(&send_req, MPI_STATUS_IGNORE);
+
+          irecv_req_list.push_back(task);
+          iter = reduce_req_list.erase(iter);
+        }
+        else iter++;
+      }
+    }
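Note the ordering in `check_and_launch_comm_to_ps()` above: the `MPI_Irecv` for the server's reply is posted before the `MPI_Isend` of the freshly reduced gradient, so the reply can never arrive unmatched (the `CHECK(!recv_flag)` asserts it cannot already be there). A toy two-rank program showing the same handshake; the assumptions are made up for illustration (rank 0 plays a group root, rank 1 the parameter server, fixed param_id/part_id for the tag, halving as a stand-in update):

```
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  const int tag = 3 * 10 + 0 + 1973;  // param_id 3, part_id 0 -> tag 2003
  if (rank == 0) {
    double grad[4] = {1, 2, 3, 4}, param[4] = {0};
    MPI_Request recv_req, send_req;
    // Pre-post the recv for the reply *before* sending the gradient.
    MPI_Irecv(param, 4, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD, &recv_req);
    MPI_Isend(grad, 4, MPI_DOUBLE, 1, tag, MPI_COMM_WORLD, &send_req);
    MPI_Wait(&send_req, MPI_STATUS_IGNORE);
    MPI_Wait(&recv_req, MPI_STATUS_IGNORE);  // fresh params now in `param`
    std::printf("worker: param[0] = %g\n", param[0]);
  } else if (rank == 1) {
    double buf[4];
    MPI_Recv(buf, 4, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    for (int i = 0; i < 4; ++i) buf[i] *= 0.5;  // stand-in for the SGD update
    MPI_Send(buf, 4, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
  }
  MPI_Finalize();
  return 0;
}
```

Run with `mpirun -np 2` to see the round trip.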
+    void launch_param_broadcast(int layer_id, int param_id) {
+      Dtype* buff;
+      if (CAN_USE_PRV_DATA(net_params[param_id])) {
+        if (distrib_bcast->is_root(MLSL::GroupType::GT_DATA))
+          buff = (Dtype*)net_params[param_id]->prv_data();
+        else
+          buff = net_params[param_id]->mutable_prv_data();
+      }
+      else {
+        if (distrib_bcast->is_root(MLSL::GroupType::GT_DATA))
+          buff = (Dtype*)net_params[param_id]->cpu_data();
+        else
+          buff = net_params[param_id]->mutable_cpu_data();
+      }
+      size_t buf_size = net_params[param_id]->count();
+      broadcast_req_vec[param_id] =
+        distrib_bcast->bcast_async<Dtype, MLSL::GroupType::GT_DATA>(buff, buf_size);
+    }
+
+    void check_and_launch_broadcast() {
+      std::list<mn::TaskRequest>::iterator iter = irecv_req_list.begin();
+      while (iter != irecv_req_list.end()) {
+        int flag = 0;
+        int param_id = iter->param_id_;
+        if (irecv_done[param_id]) {
+          flag = 1;
+        } else {
+          MPI_Test(&irecv_req_vec[param_id], &flag, MPI_STATUS_IGNORE);
+        }
+        if (flag) {
+          irecv_req_vec[param_id] = MPI_REQUEST_NULL;
+          irecv_done[param_id] = true;
+          iter = irecv_req_list.erase(iter);
+        }
+        else
+          iter++;
+      }
+      // Make sure the order of bcast is the same inside the group:
+      // layers and net params in reverse order.
+      // TODO: relax this ordering constraint for more efficient
+      // communication
+      for (int i = layers.size() - 1; i >= 0; i--) {
+        for (int j = layer_param_ids[i].size() - 1; j >= 0; j--) {
+          int param_id = layer_param_ids[i][j];
+          if (!broadcast_launched[param_id]) {
+            if (irecv_done[param_id]) {
+              launch_param_broadcast(i, param_id);
+              broadcast_launched[param_id] = true;
+            } else return;
+          }
+        }
+      }
+    }
+
+    void on_backward_finished(int layer_id) {
+      boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
+      if (layer->layerOp == nullptr) {
+        return;
+      }
+
+      if (mn::use_param_server()) {
+        std::vector<int> &param_ids = layer_param_ids[layer_id];
+        // TODO: is descending order faster?
+        for (int i = param_ids.size() - 1; i >= 0; --i) {
+          if (!layer->ParamNeedReduce(i)) continue;
+          launch_reduce(layer_id, param_ids[i]);
+          mn::Distribution &distrib = layer->GetDistribution();
+          if (distrib.is_root(MLSL::GroupType::GT_DATA)) {
+            check_and_launch_comm_to_ps();
+            check_and_launch_broadcast();
+          } else {
+            launch_param_broadcast(layer_id, param_ids[i]);
+          }
+        }
+      } else {
+        launch_allreduce(layer_id);
+      }
+    }
+
+    void delwt_wait_ps(int layer_id) {
+      mn::Distribution &distrib = layers[layer_id]->GetDistribution();
+      if (distrib.is_root(MLSL::GroupType::GT_DATA)) {
+        std::vector<int> &param_ids = layer_param_ids[layer_id];
+        // TODO: can we start comm with the PS earlier? Per-layer data would be inconsistent then.
+        check_and_launch_comm_to_ps();
+        check_and_launch_broadcast();
+        for (int i = param_ids.size() - 1; i >= 0; i--) {
+          int param_id = param_ids[i];
+          // wait for the reduce
+          if (reduce_req_vec[param_id] != NULL) {
+            MLSL::Environment::GetEnv().Wait(reduce_req_vec[param_id]);
+          }
+          reduce_req_vec[param_id] = NULL;
+          // wait for the new param from the param server
+          if (irecv_req_vec[param_id] != MPI_REQUEST_NULL) {
+            MPI_Wait(&irecv_req_vec[param_id], MPI_STATUS_IGNORE);
+            // the req is set to MPI_REQUEST_NULL to mark the request as already finished
+            irecv_req_vec[param_id] = MPI_REQUEST_NULL;
+          }
+          irecv_done[param_id] = false;
+          // wait for the completion of the broadcast
+          if (broadcast_req_vec[param_id] != NULL) {
+            MLSL::Environment::GetEnv().Wait(broadcast_req_vec[param_id]);
+            broadcast_req_vec[param_id] = NULL;
+          }
+          broadcast_launched[param_id] = false;
+        }
+      }
+#ifdef FW_OVERLAP_OPT
+      solver->set_layer_finished_flag(layer_id, true);
+#endif
+    }
+
+    void delwt_wait_no_ps(int layer_id) {
       boost::shared_ptr<Layer<Dtype>> &layer = layers[layer_id];
       if (layer->layerOp == nullptr) {
 #ifdef FW_OVERLAP_OPT
@@ -290,6 +506,14 @@ namespace caffe {
 #endif
     }
 
+    void on_delwt_wait(int layer_id) {
+      if (mn::use_param_server()) {
+        delwt_wait_ps(layer_id);
+      } else {
+        delwt_wait_no_ps(layer_id);
+      }
+    }
+
     void on_gradients_ready() {
       DLOG(INFO) << "finished iteration " << solver->root_solver()->iter();
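`check_and_launch_broadcast()` above releases broadcasts strictly in reverse layer/param order, so every rank in a group issues the same sequence of collectives: a param's bcast may launch only once its recv has completed and every param after it in the agreed order has already launched. A toy model of that gating rule (the recv_done flags are made up for illustration):

```
#include <vector>
#include <cstdio>

int main() {
  const int n = 5;
  std::vector<bool> recv_done = {true, true, false, true, true};
  std::vector<bool> launched(n, false);
  // Scan in the fixed agreed order (here: descending param id).
  for (int p = n - 1; p >= 0; --p) {
    if (launched[p]) continue;
    if (!recv_done[p]) break;  // stop: launching p-1 first would reorder the collectives
    launched[p] = true;
    std::printf("bcast launched for param %d\n", p);
  }
  // Params 4 and 3 launch; param 2 blocks 1 and 0 even though their recvs are done.
}
```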
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index ba47be986..362b28de7 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -310,6 +310,19 @@ class Net {
       const NetParameter& param, int layer_id);
 
+  static void GetNeedToCancelInplaceLayers(
+      vector<vector<const LayerParameter*> >& layer_pairs,
+      std::map<string, int>& specified_layer_blob_name_to_index,
+      std::map<string, int>& inplace_blob_name_to_index,
+      vector<string>& each_blob_list,
+      const NetParameter& param);
+
+  static void ParseNetInplaceStatus(
+      std::map<string, int>& inplace_blob_name_to_index,
+      std::map<string, int>& specified_layer_blob_name_to_index,
+      vector<vector<string> >& specified_layer_input_blob_names,
+      NetParameter* param, const string& specified_layer_type);
+
   /// @brief return whether NetState state meets NetStateRule rule
   static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
       const string& layer_name);
diff --git a/include/mpi.h b/include/mpi.h
new file mode 100644
index 000000000..a437eb9b5
--- /dev/null
+++ b/include/mpi.h
@@ -0,0 +1,2286 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *  (C) 2001 by Argonne National Laboratory.
+ *      See COPYRIGHT in top-level directory.
+ */
+/* src/include/mpi.h.  Generated from mpi.h.in by configure.
*/ +#ifndef MPI_INCLUDED +#define MPI_INCLUDED + +/* user include file for MPI programs */ + +/* Keep C++ compilers from getting confused */ +#if defined(__cplusplus) +extern "C" { +#endif + +#define NO_TAGS_WITH_MODIFIERS 1 +#undef MPICH_DEFINE_ATTR_TYPE_TYPES +#if defined(__has_attribute) +# if __has_attribute(pointer_with_type_tag) && \ + __has_attribute(type_tag_for_datatype) && \ + !defined(NO_TAGS_WITH_MODIFIERS) &&\ + !defined(MPICH_NO_ATTR_TYPE_TAGS) +# define MPICH_DEFINE_ATTR_TYPE_TYPES 1 +# define MPICH_ATTR_POINTER_WITH_TYPE_TAG(buffer_idx, type_idx) __attribute__((pointer_with_type_tag(MPI,buffer_idx,type_idx))) +# define MPICH_ATTR_TYPE_TAG(type) __attribute__((type_tag_for_datatype(MPI,type))) +# define MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(type) __attribute__((type_tag_for_datatype(MPI,type,layout_compatible))) +# define MPICH_ATTR_TYPE_TAG_MUST_BE_NULL() __attribute__((type_tag_for_datatype(MPI,void,must_be_null))) +# include +# endif +#endif + +#if !defined(MPICH_ATTR_POINTER_WITH_TYPE_TAG) +# define MPICH_ATTR_POINTER_WITH_TYPE_TAG(buffer_idx, type_idx) +# define MPICH_ATTR_TYPE_TAG(type) +# define MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(type) +# define MPICH_ATTR_TYPE_TAG_MUST_BE_NULL() +#endif + +#if !defined(INT8_C) +/* stdint.h was not included, see if we can get it */ +# if defined(__cplusplus) +# if __cplusplus >= 201103 +# include +# endif +# endif +#endif + +#if !defined(INT8_C) +/* stdint.h was not included, see if we can get it */ +# if defined(__STDC_VERSION__) +# if __STDC_VERSION__ >= 199901 +# include +# endif +# endif +#endif + +#if defined(INT8_C) +/* stdint.h was included, so we can annotate these types */ +# define MPICH_ATTR_TYPE_TAG_STDINT(type) MPICH_ATTR_TYPE_TAG(type) +#else +# define MPICH_ATTR_TYPE_TAG_STDINT(type) +#endif + +#ifdef __STDC_VERSION__ +#if __STDC_VERSION__ >= 199901 +# define MPICH_ATTR_TYPE_TAG_C99(type) MPICH_ATTR_TYPE_TAG(type) +#else +# define MPICH_ATTR_TYPE_TAG_C99(type) +#endif +#else +# define MPICH_ATTR_TYPE_TAG_C99(type) +#endif + +#if defined(__cplusplus) +# define MPICH_ATTR_TYPE_TAG_CXX(type) MPICH_ATTR_TYPE_TAG(type) +#else +# define MPICH_ATTR_TYPE_TAG_CXX(type) +#endif + + +/* Define some null objects */ +#define MPI_COMM_NULL ((MPI_Comm)0x04000000) +#define MPI_OP_NULL ((MPI_Op)0x18000000) +#define MPI_GROUP_NULL ((MPI_Group)0x08000000) +#define MPI_DATATYPE_NULL ((MPI_Datatype)0x0c000000) +#define MPI_REQUEST_NULL ((MPI_Request)0x2c000000) +#define MPI_ERRHANDLER_NULL ((MPI_Errhandler)0x14000000) +#define MPI_MESSAGE_NULL ((MPI_Message)0x2c000000) +#define MPI_MESSAGE_NO_PROC ((MPI_Message)0x6c000000) + +/* Results of the compare operations. 
*/ +#define MPI_IDENT 0 +#define MPI_CONGRUENT 1 +#define MPI_SIMILAR 2 +#define MPI_UNEQUAL 3 + +typedef int MPI_Datatype; +#define MPI_CHAR ((MPI_Datatype)0x4c000101) +#define MPI_SIGNED_CHAR ((MPI_Datatype)0x4c000118) +#define MPI_UNSIGNED_CHAR ((MPI_Datatype)0x4c000102) +#define MPI_BYTE ((MPI_Datatype)0x4c00010d) +#define MPI_WCHAR ((MPI_Datatype)0x4c00040e) +#define MPI_SHORT ((MPI_Datatype)0x4c000203) +#define MPI_UNSIGNED_SHORT ((MPI_Datatype)0x4c000204) +#define MPI_INT ((MPI_Datatype)0x4c000405) +#define MPI_UNSIGNED ((MPI_Datatype)0x4c000406) +#define MPI_LONG ((MPI_Datatype)0x4c000807) +#define MPI_UNSIGNED_LONG ((MPI_Datatype)0x4c000808) +#define MPI_FLOAT ((MPI_Datatype)0x4c00040a) +#define MPI_DOUBLE ((MPI_Datatype)0x4c00080b) +#define MPI_LONG_DOUBLE ((MPI_Datatype)0x4c00100c) +#define MPI_LONG_LONG_INT ((MPI_Datatype)0x4c000809) +#define MPI_UNSIGNED_LONG_LONG ((MPI_Datatype)0x4c000819) +#define MPI_LONG_LONG MPI_LONG_LONG_INT + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_char MPICH_ATTR_TYPE_TAG(char) = MPI_CHAR; +static const MPI_Datatype mpich_mpi_signed_char MPICH_ATTR_TYPE_TAG(signed char) = MPI_SIGNED_CHAR; +static const MPI_Datatype mpich_mpi_unsigned_char MPICH_ATTR_TYPE_TAG(unsigned char) = MPI_UNSIGNED_CHAR; +/*static const MPI_Datatype mpich_mpi_byte MPICH_ATTR_TYPE_TAG(char) = MPI_BYTE;*/ +static const MPI_Datatype mpich_mpi_wchar MPICH_ATTR_TYPE_TAG(wchar_t) = MPI_WCHAR; +static const MPI_Datatype mpich_mpi_short MPICH_ATTR_TYPE_TAG(short) = MPI_SHORT; +static const MPI_Datatype mpich_mpi_unsigned_short MPICH_ATTR_TYPE_TAG(unsigned short) = MPI_UNSIGNED_SHORT; +static const MPI_Datatype mpich_mpi_int MPICH_ATTR_TYPE_TAG(int) = MPI_INT; +static const MPI_Datatype mpich_mpi_unsigned MPICH_ATTR_TYPE_TAG(unsigned) = MPI_UNSIGNED; +static const MPI_Datatype mpich_mpi_long MPICH_ATTR_TYPE_TAG(long) = MPI_LONG; +static const MPI_Datatype mpich_mpi_unsigned_long MPICH_ATTR_TYPE_TAG(unsigned long) = MPI_UNSIGNED_LONG; +static const MPI_Datatype mpich_mpi_float MPICH_ATTR_TYPE_TAG(float) = MPI_FLOAT; +static const MPI_Datatype mpich_mpi_double MPICH_ATTR_TYPE_TAG(double) = MPI_DOUBLE; +#if 0x4c00100c != 0x0c000000 +static const MPI_Datatype mpich_mpi_long_double MPICH_ATTR_TYPE_TAG(long double) = MPI_LONG_DOUBLE; +#endif +static const MPI_Datatype mpich_mpi_long_long_int MPICH_ATTR_TYPE_TAG(long long int) = MPI_LONG_LONG_INT; +static const MPI_Datatype mpich_mpi_unsigned_long_long MPICH_ATTR_TYPE_TAG(unsigned long long) = MPI_UNSIGNED_LONG_LONG; +#endif + +#define MPI_PACKED ((MPI_Datatype)0x4c00010f) +#define MPI_LB ((MPI_Datatype)0x4c000010) +#define MPI_UB ((MPI_Datatype)0x4c000011) + +/* + The layouts for the types MPI_DOUBLE_INT etc are simply + struct { + double var; + int loc; + } + This is documented in the man pages on the various datatypes. 
+ */ +#define MPI_FLOAT_INT ((MPI_Datatype)0x8c000000) +#define MPI_DOUBLE_INT ((MPI_Datatype)0x8c000001) +#define MPI_LONG_INT ((MPI_Datatype)0x8c000002) +#define MPI_SHORT_INT ((MPI_Datatype)0x8c000003) +#define MPI_2INT ((MPI_Datatype)0x4c000816) +#define MPI_LONG_DOUBLE_INT ((MPI_Datatype)0x8c000004) + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +struct mpich_struct_mpi_float_int { float f; int i; }; +struct mpich_struct_mpi_double_int { double d; int i; }; +struct mpich_struct_mpi_long_int { long l; int i; }; +struct mpich_struct_mpi_short_int { short s; int i; }; +struct mpich_struct_mpi_2int { int i1; int i2; }; +#if 0x8c000004 != 0x0c000000 +struct mpich_struct_mpi_long_double_int { long double ld; int i; }; +#endif + +static const MPI_Datatype mpich_mpi_float_int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_float_int) = MPI_FLOAT_INT; +static const MPI_Datatype mpich_mpi_double_int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_double_int) = MPI_DOUBLE_INT; +static const MPI_Datatype mpich_mpi_long_int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_long_int) = MPI_LONG_INT; +static const MPI_Datatype mpich_mpi_short_int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_short_int) = MPI_SHORT_INT; + +/* + * The MPI_2INT line is commented out because currently Clang 3.3 flags + * struct {int i1; int i2;} as different from int[2]. But actually these + * two types are of the same layout. Clang gives a type mismatch warning + * for a definitely correct code like the following: + * int in[2], out[2]; + * MPI_Reduce(in, out, 1, MPI_2INT, MPI_MAXLOC, 0, MPI_COMM_WORLD); + * + * So, we disable type checking for MPI_2INT until Clang fixes this bug. + */ + +/* static const MPI_Datatype mpich_mpi_2int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_2int) = MPI_2INT + */ + +#if 0x8c000004 != 0x0c000000 +static const MPI_Datatype mpich_mpi_long_double_int MPICH_ATTR_TYPE_TAG_LAYOUT_COMPATIBLE(struct mpich_struct_mpi_long_double_int) = MPI_LONG_DOUBLE_INT; +#endif +#endif + +/* Fortran types */ +#define MPI_COMPLEX ((MPI_Datatype)1275070494) +#define MPI_DOUBLE_COMPLEX ((MPI_Datatype)1275072546) +#define MPI_LOGICAL ((MPI_Datatype)1275069469) +#define MPI_REAL ((MPI_Datatype)1275069468) +#define MPI_DOUBLE_PRECISION ((MPI_Datatype)1275070495) +#define MPI_INTEGER ((MPI_Datatype)1275069467) +#define MPI_2INTEGER ((MPI_Datatype)1275070496) +/* + * MPI_2COMPLEX and MPI_2DOUBLE_COMPLEX were defined by accident in + * MPI 1.0 and removed in MPI 1.1. + * + * This definition provides backward compatibility. 
These definitions + * will be removed in a subsequent MPICH release + */ +#ifdef MPICH_DEFINE_2COMPLEX +#define MPI_2COMPLEX ((MPI_Datatype)1275072548) +#define MPI_2DOUBLE_COMPLEX ((MPI_Datatype)1275076645) +#endif +#define MPI_2REAL ((MPI_Datatype)1275070497) +#define MPI_2DOUBLE_PRECISION ((MPI_Datatype)1275072547) +#define MPI_CHARACTER ((MPI_Datatype)1275068698) + +/* Size-specific types (see MPI-2, 10.2.5) */ +#define MPI_REAL4 ((MPI_Datatype)0x4c000427) +#define MPI_REAL8 ((MPI_Datatype)0x4c000829) +#define MPI_REAL16 ((MPI_Datatype)0x4c00102b) +#define MPI_COMPLEX8 ((MPI_Datatype)0x4c000828) +#define MPI_COMPLEX16 ((MPI_Datatype)0x4c00102a) +#define MPI_COMPLEX32 ((MPI_Datatype)0x4c00202c) +#define MPI_INTEGER1 ((MPI_Datatype)0x4c00012d) +#define MPI_INTEGER2 ((MPI_Datatype)0x4c00022f) +#define MPI_INTEGER4 ((MPI_Datatype)0x4c000430) +#define MPI_INTEGER8 ((MPI_Datatype)0x4c000831) +#define MPI_INTEGER16 ((MPI_Datatype)MPI_DATATYPE_NULL) + +/* C99 fixed-width datatypes */ +#define MPI_INT8_T ((MPI_Datatype)0x4c000137) +#define MPI_INT16_T ((MPI_Datatype)0x4c000238) +#define MPI_INT32_T ((MPI_Datatype)0x4c000439) +#define MPI_INT64_T ((MPI_Datatype)0x4c00083a) +#define MPI_UINT8_T ((MPI_Datatype)0x4c00013b) +#define MPI_UINT16_T ((MPI_Datatype)0x4c00023c) +#define MPI_UINT32_T ((MPI_Datatype)0x4c00043d) +#define MPI_UINT64_T ((MPI_Datatype)0x4c00083e) + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_int8_t MPICH_ATTR_TYPE_TAG_STDINT(int8_t) = MPI_INT8_T; +static const MPI_Datatype mpich_mpi_int16_t MPICH_ATTR_TYPE_TAG_STDINT(int16_t) = MPI_INT16_T; +static const MPI_Datatype mpich_mpi_int32_t MPICH_ATTR_TYPE_TAG_STDINT(int32_t) = MPI_INT32_T; +static const MPI_Datatype mpich_mpi_int64_t MPICH_ATTR_TYPE_TAG_STDINT(int64_t) = MPI_INT64_T; +static const MPI_Datatype mpich_mpi_uint8_t MPICH_ATTR_TYPE_TAG_STDINT(uint8_t) = MPI_UINT8_T; +static const MPI_Datatype mpich_mpi_uint16_t MPICH_ATTR_TYPE_TAG_STDINT(uint16_t) = MPI_UINT16_T; +static const MPI_Datatype mpich_mpi_uint32_t MPICH_ATTR_TYPE_TAG_STDINT(uint32_t) = MPI_UINT32_T; +static const MPI_Datatype mpich_mpi_uint64_t MPICH_ATTR_TYPE_TAG_STDINT(uint64_t) = MPI_UINT64_T; +#endif + +/* other C99 types */ +#define MPI_C_BOOL ((MPI_Datatype)0x4c00013f) +#define MPI_C_FLOAT_COMPLEX ((MPI_Datatype)0x4c000840) +#define MPI_C_COMPLEX MPI_C_FLOAT_COMPLEX +#define MPI_C_DOUBLE_COMPLEX ((MPI_Datatype)0x4c001041) +#define MPI_C_LONG_DOUBLE_COMPLEX ((MPI_Datatype)0x4c002042) + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_c_bool MPICH_ATTR_TYPE_TAG_C99(_Bool) = MPI_C_BOOL; +static const MPI_Datatype mpich_mpi_c_float_complex MPICH_ATTR_TYPE_TAG_C99(float _Complex) = MPI_C_FLOAT_COMPLEX; +static const MPI_Datatype mpich_mpi_c_double_complex MPICH_ATTR_TYPE_TAG_C99(double _Complex) = MPI_C_DOUBLE_COMPLEX; +#if 0x4c002042 != 0x0c000000 +static const MPI_Datatype mpich_mpi_c_long_double_complex MPICH_ATTR_TYPE_TAG_C99(long double _Complex) = MPI_C_LONG_DOUBLE_COMPLEX; +#endif +#endif + +/* address/offset types */ +#define MPI_AINT ((MPI_Datatype)0x4c000843) +#define MPI_OFFSET ((MPI_Datatype)0x4c000844) +#define MPI_COUNT ((MPI_Datatype)0x4c000845) + +/* MPI-3 C++ types */ +#define MPI_CXX_BOOL ((MPI_Datatype)0x4c000133) +#define MPI_CXX_FLOAT_COMPLEX ((MPI_Datatype)0x4c000834) +#define MPI_CXX_DOUBLE_COMPLEX ((MPI_Datatype)0x4c001035) +#define MPI_CXX_LONG_DOUBLE_COMPLEX ((MPI_Datatype)0x4c002036) + +/* typeclasses */ +#define MPI_TYPECLASS_REAL 1 +#define MPI_TYPECLASS_INTEGER 2 +#define 
MPI_TYPECLASS_COMPLEX 3 + +/* Communicators */ +typedef int MPI_Comm; +#define MPI_COMM_WORLD ((MPI_Comm)0x44000000) +#define MPI_COMM_SELF ((MPI_Comm)0x44000001) + +/* Groups */ +typedef int MPI_Group; +#define MPI_GROUP_EMPTY ((MPI_Group)0x48000000) + +/* RMA and Windows */ +typedef int MPI_Win; +#define MPI_WIN_NULL ((MPI_Win)0x20000000) + +/* File and IO */ +/* This define lets ROMIO know that MPI_File has been defined */ +#define MPI_FILE_DEFINED +/* ROMIO uses a pointer for MPI_File objects. This must be the same definition + as in src/mpi/romio/include/mpio.h.in */ +typedef struct ADIOI_FileD *MPI_File; +#define MPI_FILE_NULL ((MPI_File)0) + +/* Collective operations */ +typedef int MPI_Op; + +#define MPI_MAX (MPI_Op)(0x58000001) +#define MPI_MIN (MPI_Op)(0x58000002) +#define MPI_SUM (MPI_Op)(0x58000003) +#define MPI_PROD (MPI_Op)(0x58000004) +#define MPI_LAND (MPI_Op)(0x58000005) +#define MPI_BAND (MPI_Op)(0x58000006) +#define MPI_LOR (MPI_Op)(0x58000007) +#define MPI_BOR (MPI_Op)(0x58000008) +#define MPI_LXOR (MPI_Op)(0x58000009) +#define MPI_BXOR (MPI_Op)(0x5800000a) +#define MPI_MINLOC (MPI_Op)(0x5800000b) +#define MPI_MAXLOC (MPI_Op)(0x5800000c) +#define MPI_REPLACE (MPI_Op)(0x5800000d) +#define MPI_NO_OP (MPI_Op)(0x5800000e) + +/* Permanent key values */ +/* C Versions (return pointer to value), + Fortran Versions (return integer value). + Handled directly by the attribute value routine + + DO NOT CHANGE THESE. The values encode: + builtin kind (0x1 in bit 30-31) + Keyval object (0x9 in bits 26-29) + for communicator (0x1 in bits 22-25) + + Fortran versions of the attributes are formed by adding one to + the C version. + */ +#define MPI_TAG_UB 0x64400001 +#define MPI_HOST 0x64400003 +#define MPI_IO 0x64400005 +#define MPI_WTIME_IS_GLOBAL 0x64400007 +#define MPI_UNIVERSE_SIZE 0x64400009 +#define MPI_LASTUSEDCODE 0x6440000b +#define MPI_APPNUM 0x6440000d + +/* In addition, there are 5 predefined window attributes that are + defined for every window */ +#define MPI_WIN_BASE 0x66000001 +#define MPI_WIN_SIZE 0x66000003 +#define MPI_WIN_DISP_UNIT 0x66000005 +#define MPI_WIN_CREATE_FLAVOR 0x66000007 +#define MPI_WIN_MODEL 0x66000009 + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_datatype_null MPICH_ATTR_TYPE_TAG_MUST_BE_NULL() = MPI_DATATYPE_NULL; +#endif + +/* These are only guesses; make sure you change them in mpif.h as well */ +#define MPI_MAX_PROCESSOR_NAME 128 +#define MPI_MAX_LIBRARY_VERSION_STRING 8192 +#define MPI_MAX_ERROR_STRING 512 +#define MPI_MAX_PORT_NAME 256 +#define MPI_MAX_OBJECT_NAME 128 + +/* Pre-defined constants */ +#define MPI_UNDEFINED (-32766) +#define MPI_KEYVAL_INVALID 0x24000000 + +/* MPI-3 window flavors */ +typedef enum MPIR_Win_flavor { + MPI_WIN_FLAVOR_CREATE = 1, + MPI_WIN_FLAVOR_ALLOCATE = 2, + MPI_WIN_FLAVOR_DYNAMIC = 3, + MPI_WIN_FLAVOR_SHARED = 4 +} MPIR_Win_flavor_t; + +/* MPI-3 window consistency models */ +typedef enum MPIR_Win_model { + MPI_WIN_SEPARATE = 1, + MPI_WIN_UNIFIED = 2 +} MPIR_Win_model_t; + +/* Upper bound on the overhead in bsend for each message buffer */ +#define MPI_BSEND_OVERHEAD 96 + +/* Topology types */ +typedef enum MPIR_Topo_type { MPI_GRAPH=1, MPI_CART=2, MPI_DIST_GRAPH=3 } MPIR_Topo_type; + +#define MPI_BOTTOM (void *)0 +extern int * const MPI_UNWEIGHTED; +extern int * const MPI_WEIGHTS_EMPTY; + +#define MPI_PROC_NULL (-1) +#define MPI_ANY_SOURCE (-2) +#define MPI_ROOT (-3) +#define MPI_ANY_TAG (-1) + +#define MPI_LOCK_EXCLUSIVE 234 +#define MPI_LOCK_SHARED 235 + +/* C functions */ +typedef 
void (MPI_Handler_function) ( MPI_Comm *, int *, ... ); +typedef int (MPI_Comm_copy_attr_function)(MPI_Comm, int, void *, void *, + void *, int *); +typedef int (MPI_Comm_delete_attr_function)(MPI_Comm, int, void *, void *); +typedef int (MPI_Type_copy_attr_function)(MPI_Datatype, int, void *, void *, + void *, int *); +typedef int (MPI_Type_delete_attr_function)(MPI_Datatype, int, void *, void *); +typedef int (MPI_Win_copy_attr_function)(MPI_Win, int, void *, void *, void *, + int *); +typedef int (MPI_Win_delete_attr_function)(MPI_Win, int, void *, void *); +/* added in MPI-2.2 */ +typedef void (MPI_Comm_errhandler_function)(MPI_Comm *, int *, ...); +typedef void (MPI_File_errhandler_function)(MPI_File *, int *, ...); +typedef void (MPI_Win_errhandler_function)(MPI_Win *, int *, ...); +/* names that were added in MPI-2.0 and deprecated in MPI-2.2 */ +typedef MPI_Comm_errhandler_function MPI_Comm_errhandler_fn; +typedef MPI_File_errhandler_function MPI_File_errhandler_fn; +typedef MPI_Win_errhandler_function MPI_Win_errhandler_fn; + +/* Built in (0x1 in 30-31), errhandler (0x5 in bits 26-29, allkind (0 + in 22-25), index in the low bits */ +#define MPI_ERRORS_ARE_FATAL ((MPI_Errhandler)0x54000000) +#define MPI_ERRORS_RETURN ((MPI_Errhandler)0x54000001) +/* MPIR_ERRORS_THROW_EXCEPTIONS is not part of the MPI standard, it is here to + facilitate the c++ binding which has MPI::ERRORS_THROW_EXCEPTIONS. + Using the MPIR prefix preserved the MPI_ names for objects defined by + the standard. */ +#define MPIR_ERRORS_THROW_EXCEPTIONS ((MPI_Errhandler)0x54000002) +typedef int MPI_Errhandler; + +/* Make the C names for the dup function mixed case. + This is required for systems that use all uppercase names for Fortran + externals. */ +/* MPI 1 names */ +#define MPI_NULL_COPY_FN ((MPI_Copy_function *)0) +#define MPI_NULL_DELETE_FN ((MPI_Delete_function *)0) +#define MPI_DUP_FN MPIR_Dup_fn +/* MPI 2 names */ +#define MPI_COMM_NULL_COPY_FN ((MPI_Comm_copy_attr_function*)0) +#define MPI_COMM_NULL_DELETE_FN ((MPI_Comm_delete_attr_function*)0) +#define MPI_COMM_DUP_FN ((MPI_Comm_copy_attr_function *)MPI_DUP_FN) +#define MPI_WIN_NULL_COPY_FN ((MPI_Win_copy_attr_function*)0) +#define MPI_WIN_NULL_DELETE_FN ((MPI_Win_delete_attr_function*)0) +#define MPI_WIN_DUP_FN ((MPI_Win_copy_attr_function*)MPI_DUP_FN) +#define MPI_TYPE_NULL_COPY_FN ((MPI_Type_copy_attr_function*)0) +#define MPI_TYPE_NULL_DELETE_FN ((MPI_Type_delete_attr_function*)0) +#define MPI_TYPE_DUP_FN ((MPI_Type_copy_attr_function*)MPI_DUP_FN) + +/* MPI request opjects */ +typedef int MPI_Request; + +/* MPI message objects for Mprobe and related functions */ +typedef int MPI_Message; + +/* User combination function */ +typedef void (MPI_User_function) ( void *, void *, int *, MPI_Datatype * ); + +/* MPI Attribute copy and delete functions */ +typedef int (MPI_Copy_function) ( MPI_Comm, int, void *, void *, void *, int * ); +typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * ); + +#define MPI_VERSION 3 +#define MPI_SUBVERSION 1 +#define MPICH_NAME 3 +#define MPICH 1 +#define MPICH_HAS_C2F 1 + + +/* MPICH_VERSION is the version string. MPICH_NUMVERSION is the + * numeric version that can be used in numeric comparisons. 
+ * + * MPICH_VERSION uses the following format: + * Version: [MAJ].[MIN].[REV][EXT][EXT_NUMBER] + * Example: 1.0.7rc1 has + * MAJ = 1 + * MIN = 0 + * REV = 7 + * EXT = rc + * EXT_NUMBER = 1 + * + * MPICH_NUMVERSION will convert EXT to a format number: + * ALPHA (a) = 0 + * BETA (b) = 1 + * RC (rc) = 2 + * PATCH (p) = 3 + * Regular releases are treated as patch 0 + * + * Numeric version will have 1 digit for MAJ, 2 digits for MIN, 2 + * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So, + * 1.0.7rc1 will have the numeric version 10007201. + */ +#define MPICH_VERSION "3.2" +#define MPICH_NUMVERSION 30200300 + +#define MPICH_RELEASE_TYPE_ALPHA 0 +#define MPICH_RELEASE_TYPE_BETA 1 +#define MPICH_RELEASE_TYPE_RC 2 +#define MPICH_RELEASE_TYPE_PATCH 3 + +#define MPICH_CALC_VERSION(MAJOR, MINOR, REVISION, TYPE, PATCH) \ + (((MAJOR) * 10000000) + ((MINOR) * 100000) + ((REVISION) * 1000) + ((TYPE) * 100) + (PATCH)) + +/* for the datatype decoders */ +enum MPIR_Combiner_enum { + MPI_COMBINER_NAMED = 1, + MPI_COMBINER_DUP = 2, + MPI_COMBINER_CONTIGUOUS = 3, + MPI_COMBINER_VECTOR = 4, + MPI_COMBINER_HVECTOR_INTEGER = 5, + MPI_COMBINER_HVECTOR = 6, + MPI_COMBINER_INDEXED = 7, + MPI_COMBINER_HINDEXED_INTEGER = 8, + MPI_COMBINER_HINDEXED = 9, + MPI_COMBINER_INDEXED_BLOCK = 10, + MPI_COMBINER_STRUCT_INTEGER = 11, + MPI_COMBINER_STRUCT = 12, + MPI_COMBINER_SUBARRAY = 13, + MPI_COMBINER_DARRAY = 14, + MPI_COMBINER_F90_REAL = 15, + MPI_COMBINER_F90_COMPLEX = 16, + MPI_COMBINER_F90_INTEGER = 17, + MPI_COMBINER_RESIZED = 18, + MPI_COMBINER_HINDEXED_BLOCK = 19 +}; + +/* for info */ +typedef int MPI_Info; +#define MPI_INFO_NULL ((MPI_Info)0x1c000000) +#define MPI_INFO_ENV ((MPI_Info)0x5c000001) +#define MPI_MAX_INFO_KEY 255 +#define MPI_MAX_INFO_VAL 1024 + +/* for subarray and darray constructors */ +#define MPI_ORDER_C 56 +#define MPI_ORDER_FORTRAN 57 +#define MPI_DISTRIBUTE_BLOCK 121 +#define MPI_DISTRIBUTE_CYCLIC 122 +#define MPI_DISTRIBUTE_NONE 123 +#define MPI_DISTRIBUTE_DFLT_DARG -49767 + +#define MPI_IN_PLACE (void *) -1 + +/* asserts for one-sided communication */ +#define MPI_MODE_NOCHECK 1024 +#define MPI_MODE_NOSTORE 2048 +#define MPI_MODE_NOPUT 4096 +#define MPI_MODE_NOPRECEDE 8192 +#define MPI_MODE_NOSUCCEED 16384 + +/* predefined types for MPI_Comm_split_type */ +#define MPI_COMM_TYPE_SHARED 1 + +/* Definitions that are determined by configure. 
*/ +typedef long MPI_Aint; +typedef int MPI_Fint; +typedef long long MPI_Count; + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_aint MPICH_ATTR_TYPE_TAG(MPI_Aint) = MPI_AINT; +#endif + +/* FIXME: The following two definition are not defined by MPI and must not be + included in the mpi.h file, as the MPI namespace is reserved to the MPI + standard */ +#define MPI_AINT_FMT_DEC_SPEC "%ld" +#define MPI_AINT_FMT_HEX_SPEC "%lx" + +/* Let ROMIO know that MPI_Offset is already defined */ +#define HAVE_MPI_OFFSET +/* MPI_OFFSET_TYPEDEF is set in configure and is + typedef $MPI_OFFSET MPI_Offset; + where $MPI_OFFSET is the correct C type */ +typedef long long MPI_Offset; + +#ifdef MPICH_DEFINE_ATTR_TYPE_TYPES +static const MPI_Datatype mpich_mpi_offset MPICH_ATTR_TYPE_TAG(MPI_Offset) = MPI_OFFSET; +#endif + +/* The order of these elements must match that in mpif.h, mpi_f08_types.f90, + and mpi_c_interface_types.f90 */ +typedef struct MPI_Status { + int count_lo; + int count_hi_and_cancelled; + int MPI_SOURCE; + int MPI_TAG; + int MPI_ERROR; +} MPI_Status; + +/* types for the MPI_T_ interface */ +struct MPIR_T_enum_s; +struct MPIR_T_cvar_handle_s; +struct MPIR_T_pvar_handle_s; +struct MPIR_T_pvar_session_s; + +typedef struct MPIR_T_enum_s * MPI_T_enum; +typedef struct MPIR_T_cvar_handle_s * MPI_T_cvar_handle; +typedef struct MPIR_T_pvar_handle_s * MPI_T_pvar_handle; +typedef struct MPIR_T_pvar_session_s * MPI_T_pvar_session; + +/* extra const at front would be safer, but is incompatible with MPI_T_ prototypes */ +extern struct MPIR_T_pvar_handle_s * const MPI_T_PVAR_ALL_HANDLES; + +#define MPI_T_ENUM_NULL ((MPI_T_enum)NULL) +#define MPI_T_CVAR_HANDLE_NULL ((MPI_T_cvar_handle)NULL) +#define MPI_T_PVAR_HANDLE_NULL ((MPI_T_pvar_handle)NULL) +#define MPI_T_PVAR_SESSION_NULL ((MPI_T_pvar_session)NULL) + +/* the MPI_T_ interface requires that these VERBOSITY constants occur in this + * relative order with increasing values */ +typedef enum MPIR_T_verbosity_t { + /* don't name-shift this if/when MPI_T_ is accepted, this is an MPICH-only + * extension */ + MPIX_T_VERBOSITY_INVALID = 0, + + /* arbitrarily shift values to aid debugging and reduce accidental errors */ + MPI_T_VERBOSITY_USER_BASIC = 221, + MPI_T_VERBOSITY_USER_DETAIL, + MPI_T_VERBOSITY_USER_ALL, + + MPI_T_VERBOSITY_TUNER_BASIC, + MPI_T_VERBOSITY_TUNER_DETAIL, + MPI_T_VERBOSITY_TUNER_ALL, + + MPI_T_VERBOSITY_MPIDEV_BASIC, + MPI_T_VERBOSITY_MPIDEV_DETAIL, + MPI_T_VERBOSITY_MPIDEV_ALL +} MPIR_T_verbosity_t; + +typedef enum MPIR_T_bind_t { + /* don't name-shift this if/when MPI_T_ is accepted, this is an MPICH-only + * extension */ + MPIX_T_BIND_INVALID = 0, + + /* arbitrarily shift values to aid debugging and reduce accidental errors */ + MPI_T_BIND_NO_OBJECT = 9700, + MPI_T_BIND_MPI_COMM, + MPI_T_BIND_MPI_DATATYPE, + MPI_T_BIND_MPI_ERRHANDLER, + MPI_T_BIND_MPI_FILE, + MPI_T_BIND_MPI_GROUP, + MPI_T_BIND_MPI_OP, + MPI_T_BIND_MPI_REQUEST, + MPI_T_BIND_MPI_WIN, + MPI_T_BIND_MPI_MESSAGE, + MPI_T_BIND_MPI_INFO +} MPIR_T_bind_t; + +typedef enum MPIR_T_scope_t { + /* don't name-shift this if/when MPI_T_ is accepted, this is an MPICH-only + * extension */ + MPIX_T_SCOPE_INVALID = 0, + + /* arbitrarily shift values to aid debugging and reduce accidental errors */ + MPI_T_SCOPE_CONSTANT = 60438, + MPI_T_SCOPE_READONLY, + MPI_T_SCOPE_LOCAL, + MPI_T_SCOPE_GROUP, + MPI_T_SCOPE_GROUP_EQ, + MPI_T_SCOPE_ALL, + MPI_T_SCOPE_ALL_EQ +} MPIR_T_scope_t; + +typedef enum MPIR_T_pvar_class_t { + /* don't name-shift this if/when MPI_T_ is 
accepted, this is an MPICH-only + * extension */ + MPIX_T_PVAR_CLASS_INVALID = 0, + + /* arbitrarily shift values to aid debugging and reduce accidental errors */ + MPIR_T_PVAR_CLASS_FIRST = 240, + MPI_T_PVAR_CLASS_STATE = MPIR_T_PVAR_CLASS_FIRST, + MPI_T_PVAR_CLASS_LEVEL, + MPI_T_PVAR_CLASS_SIZE, + MPI_T_PVAR_CLASS_PERCENTAGE, + MPI_T_PVAR_CLASS_HIGHWATERMARK, + MPI_T_PVAR_CLASS_LOWWATERMARK, + MPI_T_PVAR_CLASS_COUNTER, + MPI_T_PVAR_CLASS_AGGREGATE, + MPI_T_PVAR_CLASS_TIMER, + MPI_T_PVAR_CLASS_GENERIC, + MPIR_T_PVAR_CLASS_LAST, + MPIR_T_PVAR_CLASS_NUMBER = MPIR_T_PVAR_CLASS_LAST - MPIR_T_PVAR_CLASS_FIRST +} MPIR_T_pvar_class_t; + +/* Handle conversion types/functions */ + +/* Programs that need to convert types used in MPICH should use these */ +#define MPI_Comm_c2f(comm) (MPI_Fint)(comm) +#define MPI_Comm_f2c(comm) (MPI_Comm)(comm) +#define MPI_Type_c2f(datatype) (MPI_Fint)(datatype) +#define MPI_Type_f2c(datatype) (MPI_Datatype)(datatype) +#define MPI_Group_c2f(group) (MPI_Fint)(group) +#define MPI_Group_f2c(group) (MPI_Group)(group) +#define MPI_Info_c2f(info) (MPI_Fint)(info) +#define MPI_Info_f2c(info) (MPI_Info)(info) +#define MPI_Request_f2c(request) (MPI_Request)(request) +#define MPI_Request_c2f(request) (MPI_Fint)(request) +#define MPI_Op_c2f(op) (MPI_Fint)(op) +#define MPI_Op_f2c(op) (MPI_Op)(op) +#define MPI_Errhandler_c2f(errhandler) (MPI_Fint)(errhandler) +#define MPI_Errhandler_f2c(errhandler) (MPI_Errhandler)(errhandler) +#define MPI_Win_c2f(win) (MPI_Fint)(win) +#define MPI_Win_f2c(win) (MPI_Win)(win) +#define MPI_Message_c2f(msg) ((MPI_Fint)(msg)) +#define MPI_Message_f2c(msg) ((MPI_Message)(msg)) + +/* PMPI versions of the handle transfer functions. See section 4.17 */ +#define PMPI_Comm_c2f(comm) (MPI_Fint)(comm) +#define PMPI_Comm_f2c(comm) (MPI_Comm)(comm) +#define PMPI_Type_c2f(datatype) (MPI_Fint)(datatype) +#define PMPI_Type_f2c(datatype) (MPI_Datatype)(datatype) +#define PMPI_Group_c2f(group) (MPI_Fint)(group) +#define PMPI_Group_f2c(group) (MPI_Group)(group) +#define PMPI_Info_c2f(info) (MPI_Fint)(info) +#define PMPI_Info_f2c(info) (MPI_Info)(info) +#define PMPI_Request_f2c(request) (MPI_Request)(request) +#define PMPI_Request_c2f(request) (MPI_Fint)(request) +#define PMPI_Op_c2f(op) (MPI_Fint)(op) +#define PMPI_Op_f2c(op) (MPI_Op)(op) +#define PMPI_Errhandler_c2f(errhandler) (MPI_Fint)(errhandler) +#define PMPI_Errhandler_f2c(errhandler) (MPI_Errhandler)(errhandler) +#define PMPI_Win_c2f(win) (MPI_Fint)(win) +#define PMPI_Win_f2c(win) (MPI_Win)(win) +#define PMPI_Message_c2f(msg) ((MPI_Fint)(msg)) +#define PMPI_Message_f2c(msg) ((MPI_Message)(msg)) + +#define MPI_STATUS_IGNORE (MPI_Status *)1 +#define MPI_STATUSES_IGNORE (MPI_Status *)1 +#define MPI_ERRCODES_IGNORE (int *)0 + +/* See 4.12.5 for MPI_F_STATUS(ES)_IGNORE */ +#define MPIU_DLL_SPEC +extern MPIU_DLL_SPEC MPI_Fint * MPI_F_STATUS_IGNORE; +extern MPIU_DLL_SPEC MPI_Fint * MPI_F_STATUSES_IGNORE; +/* The annotation MPIU_DLL_SPEC to the extern statements is used + as a hook for systems that require C extensions to correctly construct + DLLs, and is defined as an empty string otherwise + */ + +/* The MPI standard requires that the ARGV_NULL values be the same as + NULL (see 5.3.2) */ +#define MPI_ARGV_NULL (char **)0 +#define MPI_ARGVS_NULL (char ***)0 + +/* C type for MPI_STATUS in F08. + The field order should match that in mpi_f08_types.f90, and mpi_c_interface_types.f90. 
+ */ +typedef struct { + MPI_Fint count_lo; + MPI_Fint count_hi_and_cancelled; + MPI_Fint MPI_SOURCE; + MPI_Fint MPI_TAG; + MPI_Fint MPI_ERROR; +} MPI_F08_Status; + +extern MPI_F08_Status MPIR_F08_MPI_STATUS_IGNORE_OBJ; +extern MPI_F08_Status MPIR_F08_MPI_STATUSES_IGNORE_OBJ[1]; +extern int MPIR_F08_MPI_IN_PLACE; +extern int MPIR_F08_MPI_BOTTOM; + +/* Pointers to above objects */ +extern MPI_F08_Status *MPI_F08_STATUS_IGNORE; +extern MPI_F08_Status *MPI_F08_STATUSES_IGNORE; + +/* For supported thread levels */ +#define MPI_THREAD_SINGLE 0 +#define MPI_THREAD_FUNNELED 1 +#define MPI_THREAD_SERIALIZED 2 +#define MPI_THREAD_MULTIPLE 3 + +/* Typedefs for generalized requests */ +typedef int (MPI_Grequest_cancel_function)(void *, int); +typedef int (MPI_Grequest_free_function)(void *); +typedef int (MPI_Grequest_query_function)(void *, MPI_Status *); +typedef int (MPIX_Grequest_poll_function)(void *, MPI_Status *); +typedef int (MPIX_Grequest_wait_function)(int, void **, double, MPI_Status *); + +/* MPI's error classes */ +#define MPI_SUCCESS 0 /* Successful return code */ +/* Communication argument parameters */ +#define MPI_ERR_BUFFER 1 /* Invalid buffer pointer */ +#define MPI_ERR_COUNT 2 /* Invalid count argument */ +#define MPI_ERR_TYPE 3 /* Invalid datatype argument */ +#define MPI_ERR_TAG 4 /* Invalid tag argument */ +#define MPI_ERR_COMM 5 /* Invalid communicator */ +#define MPI_ERR_RANK 6 /* Invalid rank */ +#define MPI_ERR_ROOT 7 /* Invalid root */ +#define MPI_ERR_TRUNCATE 14 /* Message truncated on receive */ + +/* MPI Objects (other than COMM) */ +#define MPI_ERR_GROUP 8 /* Invalid group */ +#define MPI_ERR_OP 9 /* Invalid operation */ +#define MPI_ERR_REQUEST 19 /* Invalid mpi_request handle */ + +/* Special topology argument parameters */ +#define MPI_ERR_TOPOLOGY 10 /* Invalid topology */ +#define MPI_ERR_DIMS 11 /* Invalid dimension argument */ + +/* All other arguments. This is a class with many kinds */ +#define MPI_ERR_ARG 12 /* Invalid argument */ + +/* Other errors that are not simply an invalid argument */ +#define MPI_ERR_OTHER 15 /* Other error; use Error_string */ + +#define MPI_ERR_UNKNOWN 13 /* Unknown error */ +#define MPI_ERR_INTERN 16 /* Internal error code */ + +/* Multiple completion has three special error classes */ +#define MPI_ERR_IN_STATUS 17 /* Look in status for error value */ +#define MPI_ERR_PENDING 18 /* Pending request */ + +/* New MPI-2 Error classes */ +#define MPI_ERR_ACCESS 20 /* */ +#define MPI_ERR_AMODE 21 /* */ +#define MPI_ERR_BAD_FILE 22 /* */ +#define MPI_ERR_CONVERSION 23 /* */ +#define MPI_ERR_DUP_DATAREP 24 /* */ +#define MPI_ERR_FILE_EXISTS 25 /* */ +#define MPI_ERR_FILE_IN_USE 26 /* */ +#define MPI_ERR_FILE 27 /* */ +#define MPI_ERR_IO 32 /* */ +#define MPI_ERR_NO_SPACE 36 /* */ +#define MPI_ERR_NO_SUCH_FILE 37 /* */ +#define MPI_ERR_READ_ONLY 40 /* */ +#define MPI_ERR_UNSUPPORTED_DATAREP 43 /* */ + +/* MPI_ERR_INFO is NOT defined in the MPI-2 standard. 
I believe that + this is an oversight */ +#define MPI_ERR_INFO 28 /* */ +#define MPI_ERR_INFO_KEY 29 /* */ +#define MPI_ERR_INFO_VALUE 30 /* */ +#define MPI_ERR_INFO_NOKEY 31 /* */ + +#define MPI_ERR_NAME 33 /* */ +#define MPI_ERR_NO_MEM 34 /* Alloc_mem could not allocate memory */ +#define MPI_ERR_NOT_SAME 35 /* */ +#define MPI_ERR_PORT 38 /* */ +#define MPI_ERR_QUOTA 39 /* */ +#define MPI_ERR_SERVICE 41 /* */ +#define MPI_ERR_SPAWN 42 /* */ +#define MPI_ERR_UNSUPPORTED_OPERATION 44 /* */ +#define MPI_ERR_WIN 45 /* */ + +#define MPI_ERR_BASE 46 /* */ +#define MPI_ERR_LOCKTYPE 47 /* */ +#define MPI_ERR_KEYVAL 48 /* Erroneous attribute key */ +#define MPI_ERR_RMA_CONFLICT 49 /* */ +#define MPI_ERR_RMA_SYNC 50 /* */ +#define MPI_ERR_SIZE 51 /* */ +#define MPI_ERR_DISP 52 /* */ +#define MPI_ERR_ASSERT 53 /* */ + +#define MPI_ERR_RMA_RANGE 55 /* */ +#define MPI_ERR_RMA_ATTACH 56 /* */ +#define MPI_ERR_RMA_SHARED 57 /* */ +#define MPI_ERR_RMA_FLAVOR 58 /* */ + +/* Return codes for functions in the MPI Tool Information Interface */ +#define MPI_T_ERR_MEMORY 59 /* Out of memory */ +#define MPI_T_ERR_NOT_INITIALIZED 60 /* Interface not initialized */ +#define MPI_T_ERR_CANNOT_INIT 61 /* Interface not in the state to + be initialized */ +#define MPI_T_ERR_INVALID_INDEX 62 /* The index is invalid or + has been deleted */ +#define MPI_T_ERR_INVALID_ITEM 63 /* Item index queried is out of range */ +#define MPI_T_ERR_INVALID_HANDLE 64 /* The handle is invalid */ +#define MPI_T_ERR_OUT_OF_HANDLES 65 /* No more handles available */ +#define MPI_T_ERR_OUT_OF_SESSIONS 66 /* No more sessions available */ +#define MPI_T_ERR_INVALID_SESSION 67 /* Session argument is not valid */ +#define MPI_T_ERR_CVAR_SET_NOT_NOW 68 /* Cvar can't be set at this moment */ +#define MPI_T_ERR_CVAR_SET_NEVER 69 /* Cvar can't be set until + end of execution */ +#define MPI_T_ERR_PVAR_NO_STARTSTOP 70 /* Pvar can't be started or stopped */ +#define MPI_T_ERR_PVAR_NO_WRITE 71 /* Pvar can't be written or reset */ +#define MPI_T_ERR_PVAR_NO_ATOMIC 72 /* Pvar can't be R/W atomically */ +#define MPI_T_ERR_INVALID_NAME 73 /* Name doesn't match */ +#define MPI_T_ERR_INVALID 74 /* Generic error code for MPI_T added in MPI-3.1 */ + + +#define MPI_ERR_LASTCODE 0x3fffffff /* Last valid error code for a + predefined error class */ +/* WARNING: this is also defined in mpishared.h. Update both locations */ +#define MPICH_ERR_LAST_CLASS 74 /* It is also helpful to know the + last valid class */ + +#define MPICH_ERR_FIRST_MPIX 100 /* Define a gap here because sock is + * already using some of the values in this + * range. All MPIX error codes will be + * above this value to be ABI complaint. */ + +#define MPIX_ERR_PROC_FAILED MPICH_ERR_FIRST_MPIX+1 /* Process failure */ +#define MPIX_ERR_PROC_FAILED_PENDING MPICH_ERR_FIRST_MPIX+2 /* A failure has caused this request + * to be pending */ +#define MPIX_ERR_REVOKED MPICH_ERR_FIRST_MPIX+3 /* The communciation object has been revoked */ + +#define MPICH_ERR_LAST_MPIX MPICH_ERR_FIRST_MPIX+3 + + +/* End of MPI's error classes */ + +/* Function type defs */ +typedef int (MPI_Datarep_conversion_function)(void *, MPI_Datatype, int, + void *, MPI_Offset, void *); +typedef int (MPI_Datarep_extent_function)(MPI_Datatype datatype, MPI_Aint *, + void *); +#define MPI_CONVERSION_FN_NULL ((MPI_Datarep_conversion_function *)0) + +/* + For systems that may need to add additional definitions to support + different declaration styles and options (e.g., different calling + conventions or DLL import/export controls). 
+*/ +/* --Insert Additional Definitions Here-- */ + +/* + * Normally, we provide prototypes for all MPI routines. In a few weird + * cases, we need to suppress the prototypes. + */ +#ifndef MPICH_SUPPRESS_PROTOTYPES +/* We require that the C compiler support prototypes */ +/* Begin Prototypes */ +int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count); +int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Rsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Buffer_attach(void *buffer, int size); +int MPI_Buffer_detach(void *buffer_addr, int *size); +int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Wait(MPI_Request *request, MPI_Status *status); +int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status); +int MPI_Request_free(MPI_Request *request); +int MPI_Waitany(int count, MPI_Request array_of_requests[], int *indx, MPI_Status *status); +int MPI_Testany(int count, MPI_Request array_of_requests[], int *indx, int *flag, + MPI_Status *status); +int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]); +int MPI_Testall(int count, MPI_Request array_of_requests[], int *flag, + MPI_Status array_of_statuses[]); +int MPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, + int array_of_indices[], MPI_Status array_of_statuses[]); +int MPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, + int array_of_indices[], MPI_Status array_of_statuses[]); +int MPI_Iprobe(int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status); +int MPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status); +int MPI_Cancel(MPI_Request *request); +int MPI_Test_cancelled(const MPI_Status *status, int *flag); +int MPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm 
comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Start(MPI_Request *request); +int MPI_Startall(int count, MPI_Request array_of_requests[]); +int MPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, + int sendtag, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int source, int recvtag, MPI_Comm comm, MPI_Status *status) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(6,8); +int MPI_Sendrecv_replace(void *buf, int count, MPI_Datatype datatype, int dest, + int sendtag, int source, int recvtag, MPI_Comm comm, + MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Type_contiguous(int count, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_indexed(int count, const int *array_of_blocklengths, + const int *array_of_displacements, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_hindexed(int count, const int *array_of_blocklengths, + const MPI_Aint *array_of_displacements, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_struct(int count, const int *array_of_blocklengths, + const MPI_Aint *array_of_displacements, + const MPI_Datatype *array_of_types, MPI_Datatype *newtype); +int MPI_Address(const void *location, MPI_Aint *address); +int MPI_Type_extent(MPI_Datatype datatype, MPI_Aint *extent); +int MPI_Type_size(MPI_Datatype datatype, int *size); +int MPI_Type_lb(MPI_Datatype datatype, MPI_Aint *displacement); +int MPI_Type_ub(MPI_Datatype datatype, MPI_Aint *displacement); +int MPI_Type_commit(MPI_Datatype *datatype); +int MPI_Type_free(MPI_Datatype *datatype); +int MPI_Get_elements(const MPI_Status *status, MPI_Datatype datatype, int *count); +int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, + int outsize, int *position, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount, + MPI_Datatype datatype, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int *size); +int MPI_Barrier(MPI_Comm comm); +int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, + MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) 
MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs, + MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Alltoallv(const void *sendbuf, const int *sendcounts, const int *sdispls, + MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, + const int *rdispls, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int MPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm); +int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Op_create(MPI_User_function *user_fn, int commute, MPI_Op *op); +int MPI_Op_free(MPI_Op *op); +int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, + MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Group_size(MPI_Group group, int *size); +int MPI_Group_rank(MPI_Group group, int *rank); +int MPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[], MPI_Group group2, + int ranks2[]); +int MPI_Group_compare(MPI_Group group1, MPI_Group group2, int *result); +int MPI_Comm_group(MPI_Comm comm, MPI_Group *group); +int MPI_Group_union(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_intersection(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_difference(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int MPI_Group_incl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); +int MPI_Group_excl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); +int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group 
*newgroup); +int MPI_Group_free(MPI_Group *group); +int MPI_Comm_size(MPI_Comm comm, int *size); +int MPI_Comm_rank(MPI_Comm comm, int *rank); +int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result); +int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm); +int MPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm); +int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm); +int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm); +int MPI_Comm_free(MPI_Comm *comm); +int MPI_Comm_test_inter(MPI_Comm comm, int *flag); +int MPI_Comm_remote_size(MPI_Comm comm, int *size); +int MPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group); +int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, + int remote_leader, int tag, MPI_Comm *newintercomm); +int MPI_Intercomm_merge(MPI_Comm intercomm, int high, MPI_Comm *newintracomm); +int MPI_Keyval_create(MPI_Copy_function *copy_fn, MPI_Delete_function *delete_fn, + int *keyval, void *extra_state); +int MPI_Keyval_free(int *keyval); +int MPI_Attr_put(MPI_Comm comm, int keyval, void *attribute_val); +int MPI_Attr_get(MPI_Comm comm, int keyval, void *attribute_val, int *flag); +int MPI_Attr_delete(MPI_Comm comm, int keyval); +int MPI_Topo_test(MPI_Comm comm, int *status); +int MPI_Cart_create(MPI_Comm comm_old, int ndims, const int dims[], const int periods[], + int reorder, MPI_Comm *comm_cart); +int MPI_Dims_create(int nnodes, int ndims, int dims[]); +int MPI_Graph_create(MPI_Comm comm_old, int nnodes, const int indx[], const int edges[], + int reorder, MPI_Comm *comm_graph); +int MPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges); +int MPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, int indx[], int edges[]); +int MPI_Cartdim_get(MPI_Comm comm, int *ndims); +int MPI_Cart_get(MPI_Comm comm, int maxdims, int dims[], int periods[], int coords[]); +int MPI_Cart_rank(MPI_Comm comm, const int coords[], int *rank); +int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int coords[]); +int MPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors); +int MPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors, int neighbors[]); +int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest); +int MPI_Cart_sub(MPI_Comm comm, const int remain_dims[], MPI_Comm *newcomm); +int MPI_Cart_map(MPI_Comm comm, int ndims, const int dims[], const int periods[], int *newrank); +int MPI_Graph_map(MPI_Comm comm, int nnodes, const int indx[], const int edges[], int *newrank); +int MPI_Get_processor_name(char *name, int *resultlen); +int MPI_Get_version(int *version, int *subversion); +int MPI_Get_library_version(char *version, int *resultlen); +int MPI_Errhandler_create(MPI_Handler_function *function, MPI_Errhandler *errhandler); +int MPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler); +int MPI_Errhandler_get(MPI_Comm comm, MPI_Errhandler *errhandler); +int MPI_Errhandler_free(MPI_Errhandler *errhandler); +int MPI_Error_string(int errorcode, char *string, int *resultlen); +int MPI_Error_class(int errorcode, int *errorclass); +double MPI_Wtime(void); +double MPI_Wtick(void); +int MPI_Init(int *argc, char ***argv); +int MPI_Finalize(void); +int MPI_Initialized(int *flag); +int MPI_Abort(MPI_Comm comm, int errorcode); + +/* Note that we may need to define a @PCONTROL_LIST@ depending on whether + stdargs are supported */ +int MPI_Pcontrol(const int level, ...); +int MPI_DUP_FN(MPI_Comm oldcomm, int keyval, void *extra_state, 
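+
+/* Smallest complete program using the environment-management calls above
+   (illustrative sketch, not part of the header):
+
+       #include <mpi.h>
+       #include <stdio.h>
+       int main(int argc, char **argv) {
+           int rank, size;
+           MPI_Init(&argc, &argv);
+           MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+           MPI_Comm_size(MPI_COMM_WORLD, &size);
+           printf("rank %d of %d\n", rank, size);
+           MPI_Finalize();
+           return 0;
+       }
+*/
+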
void *attribute_val_in, + void *attribute_val_out, int *flag); + +/* Process Creation and Management */ +int MPI_Close_port(const char *port_name); +int MPI_Comm_accept(const char *port_name, MPI_Info info, int root, MPI_Comm comm, + MPI_Comm *newcomm); +int MPI_Comm_connect(const char *port_name, MPI_Info info, int root, MPI_Comm comm, + MPI_Comm *newcomm); +int MPI_Comm_disconnect(MPI_Comm *comm); +int MPI_Comm_get_parent(MPI_Comm *parent); +int MPI_Comm_join(int fd, MPI_Comm *intercomm); +int MPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info, int root, + MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[], + const int array_of_maxprocs[], const MPI_Info array_of_info[], + int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name); +int MPI_Open_port(MPI_Info info, char *port_name); +int MPI_Publish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Unpublish_name(const char *service_name, MPI_Info info, const char *port_name); +int MPI_Comm_set_info(MPI_Comm comm, MPI_Info info); +int MPI_Comm_get_info(MPI_Comm comm, MPI_Info *info); + +/* One-Sided Communications */ +int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Win_complete(MPI_Win win); +int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, + MPI_Win *win); +int MPI_Win_fence(int assert, MPI_Win win); +int MPI_Win_free(MPI_Win *win); +int MPI_Win_get_group(MPI_Win win, MPI_Group *group); +int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win); +int MPI_Win_post(MPI_Group group, int assert, MPI_Win win); +int MPI_Win_start(MPI_Group group, int assert, MPI_Win win); +int MPI_Win_test(MPI_Win win, int *flag); +int MPI_Win_unlock(int rank, MPI_Win win); +int MPI_Win_wait(MPI_Win win); + +/* MPI-3 One-Sided Communication Routines */ +int MPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, void *baseptr, + MPI_Win *win); +int MPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, + void *baseptr, MPI_Win *win); +int MPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr); +int MPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win); +int MPI_Win_attach(MPI_Win win, void *base, MPI_Aint size); +int MPI_Win_detach(MPI_Win win, const void *base); +int MPI_Win_get_info(MPI_Win win, MPI_Info *info_used); +int MPI_Win_set_info(MPI_Win win, MPI_Info info); +int MPI_Get_accumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, void *result_addr, int result_count, + MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype 
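+
+/* Illustrative sketch of an active-target epoch with the one-sided calls above
+   (not part of the header; 'rank' is assumed to come from MPI_Comm_rank):
+
+       double buf = 0.0, val = 42.0;
+       MPI_Win win;
+       MPI_Win_create(&buf, sizeof(double), sizeof(double), MPI_INFO_NULL,
+                      MPI_COMM_WORLD, &win);
+       MPI_Win_fence(0, win);                        // open epoch on all ranks
+       if (rank == 0)                                // write into rank 1's window
+           MPI_Put(&val, 1, MPI_DOUBLE, 1, 0, 1, MPI_DOUBLE, win);
+       MPI_Win_fence(0, win);                        // close epoch; data visible
+       MPI_Win_free(&win);
+*/
+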
target_datatype, MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Fetch_and_op(const void *origin_addr, void *result_addr, + MPI_Datatype datatype, int target_rank, MPI_Aint target_disp, + MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, MPI_Datatype datatype, int target_rank, + MPI_Aint target_disp, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,4); +int MPI_Rput(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Rget(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Raccumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Rget_accumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, void *result_addr, int result_count, + MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Win_lock_all(int assert, MPI_Win win); +int MPI_Win_unlock_all(MPI_Win win); +int MPI_Win_flush(int rank, MPI_Win win); +int MPI_Win_flush_all(MPI_Win win); +int MPI_Win_flush_local(int rank, MPI_Win win); +int MPI_Win_flush_local_all(MPI_Win win); +int MPI_Win_sync(MPI_Win win); + +/* External Interfaces */ +int MPI_Add_error_class(int *errorclass); +int MPI_Add_error_code(int errorclass, int *errorcode); +int MPI_Add_error_string(int errorcode, const char *string); +int MPI_Comm_call_errhandler(MPI_Comm comm, int errorcode); +int MPI_Comm_create_keyval(MPI_Comm_copy_attr_function *comm_copy_attr_fn, + MPI_Comm_delete_attr_function *comm_delete_attr_fn, int *comm_keyval, + void *extra_state); +int MPI_Comm_delete_attr(MPI_Comm comm, int comm_keyval); +int MPI_Comm_free_keyval(int *comm_keyval); +int MPI_Comm_get_attr(MPI_Comm comm, int comm_keyval, void *attribute_val, int *flag); +int MPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen); +int MPI_Comm_set_attr(MPI_Comm comm, int comm_keyval, void *attribute_val); +int MPI_Comm_set_name(MPI_Comm comm, const char *comm_name); +int MPI_File_call_errhandler(MPI_File fh, int errorcode); +int MPI_Grequest_complete(MPI_Request request); +int MPI_Grequest_start(MPI_Grequest_query_function *query_fn, MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, void *extra_state, + MPI_Request *request); +int MPI_Init_thread(int *argc, char ***argv, int required, int *provided); +int MPI_Is_thread_main(int *flag); +int MPI_Query_thread(int *provided); +int MPI_Status_set_cancelled(MPI_Status *status, int flag); +int MPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype, int count); +int MPI_Type_create_keyval(MPI_Type_copy_attr_function 
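+
+/* Illustrative sketch of passive-target synchronization with the MPI-3 RMA
+   routines above: a shared counter on rank 0 ('win' exposing a long is an
+   assumption). Not part of the header.
+
+       long one = 1, old;
+       MPI_Win_lock_all(0, win);                     // access epoch to all targets
+       MPI_Fetch_and_op(&one, &old, MPI_LONG, 0, 0, MPI_SUM, win);
+       MPI_Win_flush(0, win);                        // 'old' is valid after this
+       MPI_Win_unlock_all(win);
+*/
+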
*type_copy_attr_fn, + MPI_Type_delete_attr_function *type_delete_attr_fn, + int *type_keyval, void *extra_state); +int MPI_Type_delete_attr(MPI_Datatype datatype, int type_keyval); +int MPI_Type_dup(MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_free_keyval(int *type_keyval); +int MPI_Type_get_attr(MPI_Datatype datatype, int type_keyval, void *attribute_val, int *flag); +int MPI_Type_get_contents(MPI_Datatype datatype, int max_integers, int max_addresses, + int max_datatypes, int array_of_integers[], + MPI_Aint array_of_addresses[], MPI_Datatype array_of_datatypes[]); +int MPI_Type_get_envelope(MPI_Datatype datatype, int *num_integers, int *num_addresses, + int *num_datatypes, int *combiner); +int MPI_Type_get_name(MPI_Datatype datatype, char *type_name, int *resultlen); +int MPI_Type_set_attr(MPI_Datatype datatype, int type_keyval, void *attribute_val); +int MPI_Type_set_name(MPI_Datatype datatype, const char *type_name); +int MPI_Type_match_size(int typeclass, int size, MPI_Datatype *datatype); +int MPI_Win_call_errhandler(MPI_Win win, int errorcode); +int MPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn, + MPI_Win_delete_attr_function *win_delete_attr_fn, int *win_keyval, + void *extra_state); +int MPI_Win_delete_attr(MPI_Win win, int win_keyval); +int MPI_Win_free_keyval(int *win_keyval); +int MPI_Win_get_attr(MPI_Win win, int win_keyval, void *attribute_val, int *flag); +int MPI_Win_get_name(MPI_Win win, char *win_name, int *resultlen); +int MPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val); +int MPI_Win_set_name(MPI_Win win, const char *win_name); + +int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr); +int MPI_Comm_create_errhandler(MPI_Comm_errhandler_function *comm_errhandler_fn, + MPI_Errhandler *errhandler); +int MPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *errhandler); +int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler); +int MPI_File_create_errhandler(MPI_File_errhandler_function *file_errhandler_fn, + MPI_Errhandler *errhandler); +int MPI_File_get_errhandler(MPI_File file, MPI_Errhandler *errhandler); +int MPI_File_set_errhandler(MPI_File file, MPI_Errhandler errhandler); +int MPI_Finalized(int *flag); +int MPI_Free_mem(void *base); +int MPI_Get_address(const void *location, MPI_Aint *address); +int MPI_Info_create(MPI_Info *info); +int MPI_Info_delete(MPI_Info info, const char *key); +int MPI_Info_dup(MPI_Info info, MPI_Info *newinfo); +int MPI_Info_free(MPI_Info *info); +int MPI_Info_get(MPI_Info info, const char *key, int valuelen, char *value, int *flag); +int MPI_Info_get_nkeys(MPI_Info info, int *nkeys); +int MPI_Info_get_nthkey(MPI_Info info, int n, char *key); +int MPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen, int *flag); +int MPI_Info_set(MPI_Info info, const char *key, const char *value); +int MPI_Pack_external(const char datarep[], const void *inbuf, int incount, + MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Pack_external_size(const char datarep[], int incount, MPI_Datatype datatype, + MPI_Aint *size); +int MPI_Request_get_status(MPI_Request request, int *flag, MPI_Status *status); +int MPI_Status_c2f(const MPI_Status *c_status, MPI_Fint *f_status); +int MPI_Status_f2c(const MPI_Fint *f_status, MPI_Status *c_status); +int MPI_Type_create_darray(int size, int rank, int ndims, const int array_of_gsizes[], + const int array_of_distribs[], const int array_of_dargs[], + const int 
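+
+/* Illustrative sketch of MPI_Get_address (above) and MPI_Type_create_struct
+   (declared just below) describing a C struct; the struct layout is an
+   assumption. Not part of the header.
+
+       struct particle { int id; double pos[3]; } p;
+       int          bl[2]    = {1, 3};
+       MPI_Aint     disp[2], base;
+       MPI_Datatype types[2] = {MPI_INT, MPI_DOUBLE}, ptype;
+       MPI_Get_address(&p, &base);
+       MPI_Get_address(&p.id,     &disp[0]);
+       MPI_Get_address(&p.pos[0], &disp[1]);
+       disp[0] = MPI_Aint_diff(disp[0], base);       // offsets within the struct
+       disp[1] = MPI_Aint_diff(disp[1], base);
+       MPI_Type_create_struct(2, bl, disp, types, &ptype);
+       MPI_Type_commit(&ptype);
+*/
+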
array_of_psizes[], int order, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_create_hindexed(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_create_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int MPI_Type_create_indexed_block(int count, int blocklength, const int array_of_displacements[], + MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_hindexed_block(int count, int blocklength, + const MPI_Aint array_of_displacements[], + MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent, + MPI_Datatype *newtype); +int MPI_Type_create_struct(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], + const MPI_Datatype array_of_types[], MPI_Datatype *newtype); +int MPI_Type_create_subarray(int ndims, const int array_of_sizes[], + const int array_of_subsizes[], const int array_of_starts[], + int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int MPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint *lb, MPI_Aint *extent); +int MPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb, MPI_Aint *true_extent); +int MPI_Unpack_external(const char datarep[], const void *inbuf, MPI_Aint insize, + MPI_Aint *position, void *outbuf, int outcount, MPI_Datatype datatype) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int MPI_Win_create_errhandler(MPI_Win_errhandler_function *win_errhandler_fn, + MPI_Errhandler *errhandler); +int MPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler); +int MPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler); + +/* Fortran 90-related functions. 
These routines are available only if + Fortran 90 support is enabled +*/ +int MPI_Type_create_f90_integer(int range, MPI_Datatype *newtype); +int MPI_Type_create_f90_real(int precision, int range, MPI_Datatype *newtype); +int MPI_Type_create_f90_complex(int precision, int range, MPI_Datatype *newtype); + +int MPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype, + MPI_Op op) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Op_commutative(MPI_Op op, int *commute); +int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Dist_graph_create_adjacent(MPI_Comm comm_old, int indegree, const int sources[], + const int sourceweights[], int outdegree, + const int destinations[], const int destweights[], + MPI_Info info, int reorder, MPI_Comm *comm_dist_graph); +int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[], const int degrees[], + const int destinations[], const int weights[], MPI_Info info, + int reorder, MPI_Comm *comm_dist_graph); +int MPI_Dist_graph_neighbors_count(MPI_Comm comm, int *indegree, int *outdegree, int *weighted); +int MPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], + int maxoutdegree, int destinations[], int destweights[]); + +/* Matched probe functionality */ +int MPI_Improbe(int source, int tag, MPI_Comm comm, int *flag, MPI_Message *message, + MPI_Status *status); +int MPI_Imrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message, + MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Mprobe(int source, int tag, MPI_Comm comm, MPI_Message *message, MPI_Status *status); +int MPI_Mrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message, + MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); + +/* Nonblocking collectives */ +int MPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request); +int MPI_Ibarrier(MPI_Comm comm, MPI_Request *request); +int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, + MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[], + MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm, 
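+
+/* Illustrative sketch of the matched-probe routines above, which close the
+   probe/receive race for multithreaded receivers (not part of the header;
+   needs <stdlib.h>):
+
+       MPI_Message msg;
+       MPI_Status  st;
+       int count;
+       MPI_Mprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &msg, &st);
+       MPI_Get_count(&st, MPI_INT, &count);          // size the buffer first
+       int *buf = malloc(count * sizeof(int));
+       MPI_Mrecv(buf, count, MPI_INT, &msg, &st);    // receives that exact message
+*/
+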
MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm, + MPI_Request *request); +int MPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); + +/* Neighborhood collectives */ +int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int 
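+
+/* Illustrative sketch of overlapping computation with the nonblocking
+   collectives above; 'local', 'global' and the compute call are assumptions.
+   Not part of the header.
+
+       MPI_Request req;
+       MPI_Iallreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD, &req);
+       do_independent_work();                        // overlap while it progresses
+       MPI_Wait(&req, MPI_STATUS_IGNORE);            // 'global' valid only after this
+*/
+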
recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], + const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], + const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request); +int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm); + +/* Shared memory */ +int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm); + +/* MPI-3 "large count" routines */ +int MPI_Get_elements_x(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count); +int MPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype, MPI_Count count); +int MPI_Type_get_extent_x(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent); +int MPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent); +int MPI_Type_size_x(MPI_Datatype datatype, MPI_Count *size); + +/* Noncollective communicator creation */ +int MPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm); + +/* MPI_Aint addressing arithmetic */ +MPI_Aint MPI_Aint_add(MPI_Aint base, MPI_Aint disp); +MPI_Aint MPI_Aint_diff(MPI_Aint addr1, MPI_Aint addr2); + +/* MPI_T interface */ +/* The MPI_T routines are available only in C bindings - tell tools that they + can skip these prototypes */ +/* Begin Skip Prototypes */ +int MPI_T_init_thread(int required, int *provided); +int MPI_T_finalize(void); +int MPI_T_enum_get_info(MPI_T_enum enumtype, int *num, char *name, int *name_len); +int MPI_T_enum_get_item(MPI_T_enum enumtype, int indx, int *value, char *name, int *name_len); +int MPI_T_cvar_get_num(int *num_cvar); +int MPI_T_cvar_get_info(int cvar_index, char *name, int *name_len, int *verbosity, + MPI_Datatype *datatype, MPI_T_enum *enumtype, char *desc, int *desc_len, + int *binding, int *scope); +int MPI_T_cvar_handle_alloc(int cvar_index, void *obj_handle, MPI_T_cvar_handle *handle, + int *count); +int MPI_T_cvar_handle_free(MPI_T_cvar_handle *handle); +int MPI_T_cvar_read(MPI_T_cvar_handle handle, void *buf); +int MPI_T_cvar_write(MPI_T_cvar_handle handle, const void *buf); +int 
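+
+/* Illustrative sketch of MPI_Comm_split_type above: group the ranks that share
+   a node so the result can back MPI_Win_allocate_shared. Not part of the header.
+
+       MPI_Comm node;
+       MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
+                           MPI_INFO_NULL, &node);
+*/
+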
MPI_T_pvar_get_num(int *num_pvar); +int MPI_T_pvar_get_info(int pvar_index, char *name, int *name_len, int *verbosity, int *var_class, + MPI_Datatype *datatype, MPI_T_enum *enumtype, char *desc, int *desc_len, + int *binding, int *readonly, int *continuous, int *atomic); +int MPI_T_pvar_session_create(MPI_T_pvar_session *session); +int MPI_T_pvar_session_free(MPI_T_pvar_session *session); +int MPI_T_pvar_handle_alloc(MPI_T_pvar_session session, int pvar_index, void *obj_handle, + MPI_T_pvar_handle *handle, int *count); +int MPI_T_pvar_handle_free(MPI_T_pvar_session session, MPI_T_pvar_handle *handle); +int MPI_T_pvar_start(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int MPI_T_pvar_stop(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int MPI_T_pvar_read(MPI_T_pvar_session session, MPI_T_pvar_handle handle, void *buf); +int MPI_T_pvar_write(MPI_T_pvar_session session, MPI_T_pvar_handle handle, const void *buf); +int MPI_T_pvar_reset(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int MPI_T_pvar_readreset(MPI_T_pvar_session session, MPI_T_pvar_handle handle, void *buf); +int MPI_T_category_get_num(int *num_cat); +int MPI_T_category_get_info(int cat_index, char *name, int *name_len, char *desc, int *desc_len, + int *num_cvars, int *num_pvars, int *num_categories); +int MPI_T_category_get_cvars(int cat_index, int len, int indices[]); +int MPI_T_category_get_pvars(int cat_index, int len, int indices[]); +int MPI_T_category_get_categories(int cat_index, int len, int indices[]); +int MPI_T_category_changed(int *stamp); +int MPI_T_cvar_get_index(const char *name, int *cvar_index); +int MPI_T_pvar_get_index(const char *name, int var_class, int *pvar_index); +int MPI_T_category_get_index(const char *name, int *cat_index); +/* End Skip Prototypes */ + + +/* Non-standard but public extensions to MPI */ +/* Fault Tolerance Extensions */ +int MPIX_Comm_failure_ack(MPI_Comm comm); +int MPIX_Comm_failure_get_acked(MPI_Comm comm, MPI_Group *failedgrp); +int MPIX_Comm_revoke(MPI_Comm comm); +int MPIX_Comm_shrink(MPI_Comm comm, MPI_Comm *newcomm); +int MPIX_Comm_agree(MPI_Comm comm, int *flag); + + +/* End Prototypes */ +#endif /* MPICH_SUPPRESS_PROTOTYPES */ + + +/* Here are the bindings of the profiling routines */ +#if !defined(MPI_BUILD_PROFILING) +int PMPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count); +int PMPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Rsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Buffer_attach(void *buffer, int size); +int PMPI_Buffer_detach(void *buffer_addr, int *size); +int PMPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Ibsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int 
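+
+/* The PMPI_ name-shifted bindings in this block exist so tools can interpose on
+   MPI calls. Illustrative wrapper sketch (not part of the header; the
+   'total_send_time' accumulator is an assumption):
+
+       static double total_send_time;
+       int MPI_Send(const void *buf, int count, MPI_Datatype dt, int dest,
+                    int tag, MPI_Comm comm) {
+           double t0 = PMPI_Wtime();                 // call the real implementation
+           int rc = PMPI_Send(buf, count, dt, dest, tag, comm);
+           total_send_time += PMPI_Wtime() - t0;
+           return rc;
+       }
+*/
+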
PMPI_Issend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Irsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Wait(MPI_Request *request, MPI_Status *status); +int PMPI_Test(MPI_Request *request, int *flag, MPI_Status *status); +int PMPI_Request_free(MPI_Request *request); +int PMPI_Waitany(int count, MPI_Request array_of_requests[], int *indx, MPI_Status *status); +int PMPI_Testany(int count, MPI_Request array_of_requests[], int *indx, int *flag, + MPI_Status *status); +int PMPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]); +int PMPI_Testall(int count, MPI_Request array_of_requests[], int *flag, + MPI_Status array_of_statuses[]); +int PMPI_Waitsome(int incount, MPI_Request array_of_requests[], int *outcount, + int array_of_indices[], MPI_Status array_of_statuses[]); +int PMPI_Testsome(int incount, MPI_Request array_of_requests[], int *outcount, + int array_of_indices[], MPI_Status array_of_statuses[]); +int PMPI_Iprobe(int source, int tag, MPI_Comm comm, int *flag, MPI_Status *status); +int PMPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status *status); +int PMPI_Cancel(MPI_Request *request); +int PMPI_Test_cancelled(const MPI_Status *status, int *flag); +int PMPI_Send_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Bsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Ssend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Rsend_init(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Recv_init(void *buf, int count, MPI_Datatype datatype, int source, int tag, + MPI_Comm comm, MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Start(MPI_Request *request); +int PMPI_Startall(int count, MPI_Request array_of_requests[]); +int PMPI_Sendrecv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, + int sendtag, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int source, int recvtag, MPI_Comm comm, MPI_Status *status) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(6,8); +int PMPI_Sendrecv_replace(void *buf, int count, MPI_Datatype datatype, int dest, + int sendtag, int source, int recvtag, MPI_Comm comm, + MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Type_contiguous(int count, MPI_Datatype oldtype, MPI_Datatype *newtype); +int PMPI_Type_vector(int count, int blocklength, int stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_indexed(int count, const int *array_of_blocklengths, + const int *array_of_displacements, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_hindexed(int count, const int 
*array_of_blocklengths, + const MPI_Aint *array_of_displacements, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_struct(int count, const int *array_of_blocklengths, + const MPI_Aint *array_of_displacements, + const MPI_Datatype *array_of_types, MPI_Datatype *newtype); +int PMPI_Address(const void *location, MPI_Aint *address); +int PMPI_Type_extent(MPI_Datatype datatype, MPI_Aint *extent); +int PMPI_Type_size(MPI_Datatype datatype, int *size); +int PMPI_Type_lb(MPI_Datatype datatype, MPI_Aint *displacement); +int PMPI_Type_ub(MPI_Datatype datatype, MPI_Aint *displacement); +int PMPI_Type_commit(MPI_Datatype *datatype); +int PMPI_Type_free(MPI_Datatype *datatype); +int PMPI_Get_elements(const MPI_Status *status, MPI_Datatype datatype, int *count); +int PMPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf, + int outsize, int *position, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount, + MPI_Datatype datatype, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int *size); +int PMPI_Barrier(MPI_Comm comm); +int PMPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root, + MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs, + MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int PMPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Alltoallv(const void *sendbuf, const int *sendcounts, const int *sdispls, + MPI_Datatype sendtype, void *recvbuf, const int *recvcounts, + const int *rdispls, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int PMPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const int rdispls[], const 
MPI_Datatype recvtypes[], MPI_Comm comm); +int PMPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Op_create(MPI_User_function *user_fn, int commute, MPI_Op *op); +int PMPI_Op_free(MPI_Op *op); +int PMPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, + MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Group_size(MPI_Group group, int *size); +int PMPI_Group_rank(MPI_Group group, int *rank); +int PMPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[], MPI_Group group2, + int ranks2[]); +int PMPI_Group_compare(MPI_Group group1, MPI_Group group2, int *result); +int PMPI_Comm_group(MPI_Comm comm, MPI_Group *group); +int PMPI_Group_union(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int PMPI_Group_intersection(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int PMPI_Group_difference(MPI_Group group1, MPI_Group group2, MPI_Group *newgroup); +int PMPI_Group_incl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); +int PMPI_Group_excl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup); +int PMPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int PMPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup); +int PMPI_Group_free(MPI_Group *group); +int PMPI_Comm_size(MPI_Comm comm, int *size); +int PMPI_Comm_rank(MPI_Comm comm, int *rank); +int PMPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result); +int PMPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm); +int PMPI_Comm_dup_with_info(MPI_Comm comm, MPI_Info info, MPI_Comm *newcomm); +int PMPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm); +int PMPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm); +int PMPI_Comm_free(MPI_Comm *comm); +int PMPI_Comm_test_inter(MPI_Comm comm, int *flag); +int PMPI_Comm_remote_size(MPI_Comm comm, int *size); +int PMPI_Comm_remote_group(MPI_Comm comm, MPI_Group *group); +int PMPI_Intercomm_create(MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, + int remote_leader, int tag, MPI_Comm *newintercomm); +int PMPI_Intercomm_merge(MPI_Comm intercomm, int high, MPI_Comm *newintracomm); +int PMPI_Keyval_create(MPI_Copy_function *copy_fn, MPI_Delete_function *delete_fn, + int *keyval, void *extra_state); +int PMPI_Keyval_free(int *keyval); +int PMPI_Attr_put(MPI_Comm comm, int keyval, void *attribute_val); +int PMPI_Attr_get(MPI_Comm comm, int keyval, void *attribute_val, int *flag); +int PMPI_Attr_delete(MPI_Comm comm, int keyval); +int PMPI_Topo_test(MPI_Comm comm, int *status); +int PMPI_Cart_create(MPI_Comm comm_old, int ndims, const int dims[], const int periods[], + int reorder, MPI_Comm *comm_cart); +int 
PMPI_Dims_create(int nnodes, int ndims, int dims[]); +int PMPI_Graph_create(MPI_Comm comm_old, int nnodes, const int indx[], const int edges[], + int reorder, MPI_Comm *comm_graph); +int PMPI_Graphdims_get(MPI_Comm comm, int *nnodes, int *nedges); +int PMPI_Graph_get(MPI_Comm comm, int maxindex, int maxedges, int indx[], int edges[]); +int PMPI_Cartdim_get(MPI_Comm comm, int *ndims); +int PMPI_Cart_get(MPI_Comm comm, int maxdims, int dims[], int periods[], int coords[]); +int PMPI_Cart_rank(MPI_Comm comm, const int coords[], int *rank); +int PMPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int coords[]); +int PMPI_Graph_neighbors_count(MPI_Comm comm, int rank, int *nneighbors); +int PMPI_Graph_neighbors(MPI_Comm comm, int rank, int maxneighbors, int neighbors[]); +int PMPI_Cart_shift(MPI_Comm comm, int direction, int disp, int *rank_source, int *rank_dest); +int PMPI_Cart_sub(MPI_Comm comm, const int remain_dims[], MPI_Comm *newcomm); +int PMPI_Cart_map(MPI_Comm comm, int ndims, const int dims[], const int periods[], int *newrank); +int PMPI_Graph_map(MPI_Comm comm, int nnodes, const int indx[], const int edges[], int *newrank); +int PMPI_Get_processor_name(char *name, int *resultlen); +int PMPI_Get_version(int *version, int *subversion); +int PMPI_Get_library_version(char *version, int *resultlen); +int PMPI_Errhandler_create(MPI_Handler_function *function, MPI_Errhandler *errhandler); +int PMPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler); +int PMPI_Errhandler_get(MPI_Comm comm, MPI_Errhandler *errhandler); +int PMPI_Errhandler_free(MPI_Errhandler *errhandler); +int PMPI_Error_string(int errorcode, char *string, int *resultlen); +int PMPI_Error_class(int errorcode, int *errorclass); +double PMPI_Wtime(void); +double PMPI_Wtick(void); +int PMPI_Init(int *argc, char ***argv); +int PMPI_Finalize(void); +int PMPI_Initialized(int *flag); +int PMPI_Abort(MPI_Comm comm, int errorcode); + +/* Note that we may need to define a @PCONTROL_LIST@ depending on whether + stdargs are supported */ +int PMPI_Pcontrol(const int level, ...); + +/* Process Creation and Management */ +int PMPI_Close_port(const char *port_name); +int PMPI_Comm_accept(const char *port_name, MPI_Info info, int root, MPI_Comm comm, + MPI_Comm *newcomm); +int PMPI_Comm_connect(const char *port_name, MPI_Info info, int root, MPI_Comm comm, + MPI_Comm *newcomm); +int PMPI_Comm_disconnect(MPI_Comm *comm); +int PMPI_Comm_get_parent(MPI_Comm *parent); +int PMPI_Comm_join(int fd, MPI_Comm *intercomm); +int PMPI_Comm_spawn(const char *command, char *argv[], int maxprocs, MPI_Info info, int root, + MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int PMPI_Comm_spawn_multiple(int count, char *array_of_commands[], char **array_of_argv[], + const int array_of_maxprocs[], const MPI_Info array_of_info[], + int root, MPI_Comm comm, MPI_Comm *intercomm, int array_of_errcodes[]); +int PMPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name); +int PMPI_Open_port(MPI_Info info, char *port_name); +int PMPI_Publish_name(const char *service_name, MPI_Info info, const char *port_name); +int PMPI_Unpublish_name(const char *service_name, MPI_Info info, const char *port_name); +int PMPI_Comm_set_info(MPI_Comm comm, MPI_Info info); +int PMPI_Comm_get_info(MPI_Comm comm, MPI_Info *info); + +/* One-Sided Communications */ +int PMPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, 
MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, + int target_rank, MPI_Aint target_disp, int target_count, + MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Win_complete(MPI_Win win); +int PMPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, + MPI_Win *win); +int PMPI_Win_fence(int assert, MPI_Win win); +int PMPI_Win_free(MPI_Win *win); +int PMPI_Win_get_group(MPI_Win win, MPI_Group *group); +int PMPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win); +int PMPI_Win_post(MPI_Group group, int assert, MPI_Win win); +int PMPI_Win_start(MPI_Group group, int assert, MPI_Win win); +int PMPI_Win_test(MPI_Win win, int *flag); +int PMPI_Win_unlock(int rank, MPI_Win win); +int PMPI_Win_wait(MPI_Win win); + +/* MPI-3 One-Sided Communication Routines */ +int PMPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, void *baseptr, + MPI_Win *win); +int PMPI_Win_allocate_shared(MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, + void *baseptr, MPI_Win *win); +int PMPI_Win_shared_query(MPI_Win win, int rank, MPI_Aint *size, int *disp_unit, void *baseptr); +int PMPI_Win_create_dynamic(MPI_Info info, MPI_Comm comm, MPI_Win *win); +int PMPI_Win_attach(MPI_Win win, void *base, MPI_Aint size); +int PMPI_Win_detach(MPI_Win win, const void *base); +int PMPI_Win_get_info(MPI_Win win, MPI_Info *info_used); +int PMPI_Win_set_info(MPI_Win win, MPI_Info info); +int PMPI_Get_accumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, void *result_addr, int result_count, + MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Fetch_and_op(const void *origin_addr, void *result_addr, + MPI_Datatype datatype, int target_rank, MPI_Aint target_disp, + MPI_Op op, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Compare_and_swap(const void *origin_addr, const void *compare_addr, + void *result_addr, MPI_Datatype datatype, int target_rank, + MPI_Aint target_disp, MPI_Win win) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,4); +int PMPI_Rput(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Rget(void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Raccumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Rget_accumulate(const void *origin_addr, int origin_count, + MPI_Datatype origin_datatype, void 
*result_addr, int result_count, + MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, + int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Win_lock_all(int assert, MPI_Win win); +int PMPI_Win_unlock_all(MPI_Win win); +int PMPI_Win_flush(int rank, MPI_Win win); +int PMPI_Win_flush_all(MPI_Win win); +int PMPI_Win_flush_local(int rank, MPI_Win win); +int PMPI_Win_flush_local_all(MPI_Win win); +int PMPI_Win_sync(MPI_Win win); + +/* External Interfaces */ +int PMPI_Add_error_class(int *errorclass); +int PMPI_Add_error_code(int errorclass, int *errorcode); +int PMPI_Add_error_string(int errorcode, const char *string); +int PMPI_Comm_call_errhandler(MPI_Comm comm, int errorcode); +int PMPI_Comm_create_keyval(MPI_Comm_copy_attr_function *comm_copy_attr_fn, + MPI_Comm_delete_attr_function *comm_delete_attr_fn, int *comm_keyval, + void *extra_state); +int PMPI_Comm_delete_attr(MPI_Comm comm, int comm_keyval); +int PMPI_Comm_free_keyval(int *comm_keyval); +int PMPI_Comm_get_attr(MPI_Comm comm, int comm_keyval, void *attribute_val, int *flag); +int PMPI_Comm_get_name(MPI_Comm comm, char *comm_name, int *resultlen); +int PMPI_Comm_set_attr(MPI_Comm comm, int comm_keyval, void *attribute_val); +int PMPI_Comm_set_name(MPI_Comm comm, const char *comm_name); +int PMPI_File_call_errhandler(MPI_File fh, int errorcode); +int PMPI_Grequest_complete(MPI_Request request); +int PMPI_Grequest_start(MPI_Grequest_query_function *query_fn, MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, void *extra_state, + MPI_Request *request); +int PMPI_Init_thread(int *argc, char ***argv, int required, int *provided); +int PMPI_Is_thread_main(int *flag); +int PMPI_Query_thread(int *provided); +int PMPI_Status_set_cancelled(MPI_Status *status, int flag); +int PMPI_Status_set_elements(MPI_Status *status, MPI_Datatype datatype, int count); +int PMPI_Type_create_keyval(MPI_Type_copy_attr_function *type_copy_attr_fn, + MPI_Type_delete_attr_function *type_delete_attr_fn, + int *type_keyval, void *extra_state); +int PMPI_Type_delete_attr(MPI_Datatype datatype, int type_keyval); +int PMPI_Type_dup(MPI_Datatype oldtype, MPI_Datatype *newtype); +int PMPI_Type_free_keyval(int *type_keyval); +int PMPI_Type_get_attr(MPI_Datatype datatype, int type_keyval, void *attribute_val, int *flag); +int PMPI_Type_get_contents(MPI_Datatype datatype, int max_integers, int max_addresses, + int max_datatypes, int array_of_integers[], + MPI_Aint array_of_addresses[], MPI_Datatype array_of_datatypes[]); +int PMPI_Type_get_envelope(MPI_Datatype datatype, int *num_integers, int *num_addresses, + int *num_datatypes, int *combiner); +int PMPI_Type_get_name(MPI_Datatype datatype, char *type_name, int *resultlen); +int PMPI_Type_set_attr(MPI_Datatype datatype, int type_keyval, void *attribute_val); +int PMPI_Type_set_name(MPI_Datatype datatype, const char *type_name); +int PMPI_Type_match_size(int typeclass, int size, MPI_Datatype *datatype); +int PMPI_Win_call_errhandler(MPI_Win win, int errorcode); +int PMPI_Win_create_keyval(MPI_Win_copy_attr_function *win_copy_attr_fn, + MPI_Win_delete_attr_function *win_delete_attr_fn, int *win_keyval, + void *extra_state); +int PMPI_Win_delete_attr(MPI_Win win, int win_keyval); +int PMPI_Win_free_keyval(int *win_keyval); +int PMPI_Win_get_attr(MPI_Win win, int win_keyval, void *attribute_val, int *flag); +int PMPI_Win_get_name(MPI_Win win, char 
*win_name, int *resultlen); +int PMPI_Win_set_attr(MPI_Win win, int win_keyval, void *attribute_val); +int PMPI_Win_set_name(MPI_Win win, const char *win_name); + +int PMPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr); +int PMPI_Comm_create_errhandler(MPI_Comm_errhandler_function *comm_errhandler_fn, + MPI_Errhandler *errhandler); +int PMPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *errhandler); +int PMPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler); +int PMPI_File_create_errhandler(MPI_File_errhandler_function *file_errhandler_fn, + MPI_Errhandler *errhandler); +int PMPI_File_get_errhandler(MPI_File file, MPI_Errhandler *errhandler); +int PMPI_File_set_errhandler(MPI_File file, MPI_Errhandler errhandler); +int PMPI_Finalized(int *flag); +int PMPI_Free_mem(void *base); +int PMPI_Get_address(const void *location, MPI_Aint *address); +int PMPI_Info_create(MPI_Info *info); +int PMPI_Info_delete(MPI_Info info, const char *key); +int PMPI_Info_dup(MPI_Info info, MPI_Info *newinfo); +int PMPI_Info_free(MPI_Info *info); +int PMPI_Info_get(MPI_Info info, const char *key, int valuelen, char *value, int *flag); +int PMPI_Info_get_nkeys(MPI_Info info, int *nkeys); +int PMPI_Info_get_nthkey(MPI_Info info, int n, char *key); +int PMPI_Info_get_valuelen(MPI_Info info, const char *key, int *valuelen, int *flag); +int PMPI_Info_set(MPI_Info info, const char *key, const char *value); +int PMPI_Pack_external(const char datarep[], const void *inbuf, int incount, + MPI_Datatype datatype, void *outbuf, MPI_Aint outsize, MPI_Aint *position) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Pack_external_size(const char datarep[], int incount, MPI_Datatype datatype, + MPI_Aint *size); +int PMPI_Request_get_status(MPI_Request request, int *flag, MPI_Status *status); +int PMPI_Status_c2f(const MPI_Status *c_status, MPI_Fint *f_status); +int PMPI_Status_f2c(const MPI_Fint *f_status, MPI_Status *c_status); +int PMPI_Type_create_darray(int size, int rank, int ndims, const int array_of_gsizes[], + const int array_of_distribs[], const int array_of_dargs[], + const int array_of_psizes[], int order, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_create_hindexed(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_create_hvector(int count, int blocklength, MPI_Aint stride, MPI_Datatype oldtype, + MPI_Datatype *newtype); +int PMPI_Type_create_indexed_block(int count, int blocklength, const int array_of_displacements[], + MPI_Datatype oldtype, MPI_Datatype *newtype); +int PMPI_Type_create_hindexed_block(int count, int blocklength, + const MPI_Aint array_of_displacements[], + MPI_Datatype oldtype, MPI_Datatype *newtype); +int PMPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent, + MPI_Datatype *newtype); +int PMPI_Type_create_struct(int count, const int array_of_blocklengths[], + const MPI_Aint array_of_displacements[], + const MPI_Datatype array_of_types[], MPI_Datatype *newtype); +int PMPI_Type_create_subarray(int ndims, const int array_of_sizes[], + const int array_of_subsizes[], const int array_of_starts[], + int order, MPI_Datatype oldtype, MPI_Datatype *newtype); +int PMPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint *lb, MPI_Aint *extent); +int PMPI_Type_get_true_extent(MPI_Datatype datatype, MPI_Aint *true_lb, MPI_Aint *true_extent); +int PMPI_Unpack_external(const char datarep[], const void *inbuf, MPI_Aint insize, + MPI_Aint *position, void 
*outbuf, int outcount, MPI_Datatype datatype) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int PMPI_Win_create_errhandler(MPI_Win_errhandler_function *win_errhandler_fn, + MPI_Errhandler *errhandler); +int PMPI_Win_get_errhandler(MPI_Win win, MPI_Errhandler *errhandler); +int PMPI_Win_set_errhandler(MPI_Win win, MPI_Errhandler errhandler); + +/* Fortran 90-related functions. These routines are available only if + Fortran 90 support is enabled +*/ +int PMPI_Type_create_f90_integer(int r, MPI_Datatype *newtype); +int PMPI_Type_create_f90_real(int p, int r, MPI_Datatype *newtype); +int PMPI_Type_create_f90_complex(int p, int r, MPI_Datatype *newtype); + +int PMPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype, + MPI_Op op) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Op_commutative(MPI_Op op, int *commute); +int PMPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Dist_graph_create_adjacent(MPI_Comm comm_old, int indegree, const int sources[], + const int sourceweights[], int outdegree, + const int destinations[], const int destweights[], + MPI_Info info, int reorder, MPI_Comm *comm_dist_graph); +int PMPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[], const int degrees[], + const int destinations[], const int weights[], MPI_Info info, + int reorder, MPI_Comm *comm_dist_graph); +int PMPI_Dist_graph_neighbors_count(MPI_Comm comm, int *indegree, int *outdegree, int *weighted); +int PMPI_Dist_graph_neighbors(MPI_Comm comm, int maxindegree, int sources[], int sourceweights[], + int maxoutdegree, int destinations[], int destweights[]); + +/* Matched probe functionality */ +int PMPI_Improbe(int source, int tag, MPI_Comm comm, int *flag, MPI_Message *message, + MPI_Status *status); +int PMPI_Imrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message, + MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Mprobe(int source, int tag, MPI_Comm comm, MPI_Message *message, MPI_Status *status); +int PMPI_Mrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message, + MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); + +/* Nonblocking collectives */ +int PMPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request); +int PMPI_Ibarrier(MPI_Comm comm, MPI_Request *request); +int PMPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm, + MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3); +int PMPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int 
displs[], + MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, + int root, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7); +int PMPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + const int recvcounts[], const int displs[], MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, + int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int PMPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm, + MPI_Request *request); +int PMPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, int root, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[], + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount, + MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); +int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, + MPI_Op op, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4); + +/* Neighborhood collectives */ +int PMPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, + MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Ineighbor_alltoall(const 
void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, + MPI_Request *request) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int PMPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], + const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], + void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[], + const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request); +int PMPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, const int recvcounts[], const int displs[], + MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7); +int PMPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6); +int PMPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[], + MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], + const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) + MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8); +int PMPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[], + const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[], + const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], + MPI_Comm comm); + +/* Shared memory */ +int PMPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm); + +/* Noncollective communicator creation */ +int PMPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm); + +/* MPI-3 "large count" routines */ +int PMPI_Get_elements_x(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count); +int PMPI_Status_set_elements_x(MPI_Status *status, MPI_Datatype datatype, MPI_Count count); +int PMPI_Type_get_extent_x(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent); +int PMPI_Type_get_true_extent_x(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent); +int PMPI_Type_size_x(MPI_Datatype datatype, MPI_Count *size); + +/* MPI_Aint addressing arithmetic */ +MPI_Aint PMPI_Aint_add(MPI_Aint base, MPI_Aint disp); +MPI_Aint PMPI_Aint_diff(MPI_Aint addr1, MPI_Aint addr2); + +/* MPI_T interface */ +/* The MPI_T routines are available only in C bindings - tell tools that they + can skip these prototypes */ +/* Begin Skip Prototypes */ +int PMPI_T_init_thread(int required, int *provided); +int PMPI_T_finalize(void); +int PMPI_T_enum_get_info(MPI_T_enum enumtype, int *num, char *name, int *name_len); +int PMPI_T_enum_get_item(MPI_T_enum enumtype, int indx, int *value, char *name, int *name_len); +int PMPI_T_cvar_get_num(int *num_cvar); +int PMPI_T_cvar_get_info(int cvar_index, char *name, int *name_len, int 
*verbosity, + MPI_Datatype *datatype, MPI_T_enum *enumtype, char *desc, int *desc_len, + int *binding, int *scope); +int PMPI_T_cvar_handle_alloc(int cvar_index, void *obj_handle, MPI_T_cvar_handle *handle, + int *count); +int PMPI_T_cvar_handle_free(MPI_T_cvar_handle *handle); +int PMPI_T_cvar_read(MPI_T_cvar_handle handle, void *buf); +int PMPI_T_cvar_write(MPI_T_cvar_handle handle, const void *buf); +int PMPI_T_pvar_get_num(int *num_pvar); +int PMPI_T_pvar_get_info(int pvar_index, char *name, int *name_len, int *verbosity, int *var_class, + MPI_Datatype *datatype, MPI_T_enum *enumtype, char *desc, int *desc_len, + int *binding, int *readonly, int *continuous, int *atomic); +int PMPI_T_pvar_session_create(MPI_T_pvar_session *session); +int PMPI_T_pvar_session_free(MPI_T_pvar_session *session); +int PMPI_T_pvar_handle_alloc(MPI_T_pvar_session session, int pvar_index, void *obj_handle, + MPI_T_pvar_handle *handle, int *count); +int PMPI_T_pvar_handle_free(MPI_T_pvar_session session, MPI_T_pvar_handle *handle); +int PMPI_T_pvar_start(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int PMPI_T_pvar_stop(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int PMPI_T_pvar_read(MPI_T_pvar_session session, MPI_T_pvar_handle handle, void *buf); +int PMPI_T_pvar_write(MPI_T_pvar_session session, MPI_T_pvar_handle handle, const void *buf); +int PMPI_T_pvar_reset(MPI_T_pvar_session session, MPI_T_pvar_handle handle); +int PMPI_T_pvar_readreset(MPI_T_pvar_session session, MPI_T_pvar_handle handle, void *buf); +int PMPI_T_category_get_num(int *num_cat); +int PMPI_T_category_get_info(int cat_index, char *name, int *name_len, char *desc, int *desc_len, + int *num_cvars, int *num_pvars, int *num_categories); +int PMPI_T_category_get_cvars(int cat_index, int len, int indices[]); +int PMPI_T_category_get_pvars(int cat_index, int len, int indices[]); +int PMPI_T_category_get_categories(int cat_index, int len, int indices[]); +int PMPI_T_category_changed(int *stamp); +int PMPI_T_cvar_get_index(const char *name, int *cvar_index); +int PMPI_T_pvar_get_index(const char *name, int var_class, int *pvar_index); +int PMPI_T_category_get_index(const char *name, int *cat_index); +/* End Skip Prototypes */ + + +/* Non-standard but public extensions to MPI */ +/* Fault Tolerance Extensions */ +int PMPIX_Comm_failure_ack(MPI_Comm comm); +int PMPIX_Comm_failure_get_acked(MPI_Comm comm, MPI_Group *failedgrp); +int PMPIX_Comm_revoke(MPI_Comm comm); +int PMPIX_Comm_shrink(MPI_Comm comm, MPI_Comm *newcomm); +int PMPIX_Comm_agree(MPI_Comm comm, int *flag); + +#endif /* MPI_BUILD_PROFILING */ +/* End of MPI bindings */ + +/* feature advertisement */ +#define MPIIMPL_ADVERTISES_FEATURES 1 +#define MPIIMPL_HAVE_MPI_INFO 1 +#define MPIIMPL_HAVE_MPI_COMBINER_DARRAY 1 +#define MPIIMPL_HAVE_MPI_TYPE_CREATE_DARRAY 1 +#define MPIIMPL_HAVE_MPI_COMBINER_SUBARRAY 1 +#define MPIIMPL_HAVE_MPI_TYPE_CREATE_DARRAY 1 +#define MPIIMPL_HAVE_MPI_COMBINER_DUP 1 +#define MPIIMPL_HAVE_MPI_GREQUEST 1 +#define MPIIMPL_HAVE_STATUS_SET_BYTES 1 +#define MPIIMPL_HAVE_STATUS_SET_INFO 1 + +#include "mpio.h" + +#if defined(__cplusplus) +} +/* Add the C++ bindings */ +/* + If MPICH_SKIP_MPICXX is defined, the mpicxx.h file will *not* be included. + This is necessary, for example, when building the C++ interfaces. It + can also be used when you want to use a C++ compiler to compile C code, + and do not want to load the C++ bindings. 
These definitions can + be made by the C++ compilation script + */ +#if !defined(MPICH_SKIP_MPICXX) +/* mpicxx.h contains the MPI C++ binding. In the mpi.h.in file, this + include is in an autoconf variable in case the compiler is a C++ + compiler but MPI was built without the C++ bindings */ +#include "mpicxx.h" +#endif +#endif + + +/* Generalized requests extensions */ +typedef int MPIX_Grequest_class; +int MPIX_Grequest_class_create(MPI_Grequest_query_function *query_fn, + MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, + MPIX_Grequest_poll_function *poll_fn, + MPIX_Grequest_wait_function *wait_fn, + MPIX_Grequest_class *greq_class); +int MPIX_Grequest_class_allocate(MPIX_Grequest_class greq_class, void *extra_state, + MPI_Request *request); +int MPIX_Grequest_start(MPI_Grequest_query_function *query_fn, + MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, + MPIX_Grequest_poll_function *poll_fn, + MPIX_Grequest_wait_function *wait_fn, void *extra_state, + MPI_Request *request); + +/* RMA Mutexes Extensions */ +struct mpixi_mutex_s; +typedef struct mpixi_mutex_s * MPIX_Mutex; +int MPIX_Mutex_create(int count, MPI_Comm comm, MPIX_Mutex *hdl); +int MPIX_Mutex_free(MPIX_Mutex *hdl); +int MPIX_Mutex_lock(MPIX_Mutex hdl, int mutex, int proc); +int MPIX_Mutex_unlock(MPIX_Mutex hdl, int mutex, int proc); + + +#if !defined(MPI_BUILD_PROFILING) +/* Generalized requests extensions */ +int PMPIX_Grequest_class_create(MPI_Grequest_query_function *query_fn, + MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, + MPIX_Grequest_poll_function *poll_fn, + MPIX_Grequest_wait_function *wait_fn, + MPIX_Grequest_class *greq_class); +int PMPIX_Grequest_class_allocate(MPIX_Grequest_class greq_class, void *extra_state, + MPI_Request *request); +int PMPIX_Grequest_start(MPI_Grequest_query_function *query_fn, + MPI_Grequest_free_function *free_fn, + MPI_Grequest_cancel_function *cancel_fn, + MPIX_Grequest_poll_function *poll_fn, + MPIX_Grequest_wait_function *wait_fn, void *extra_state, + MPI_Request *request); + +/* RMA Mutexes Extensions */ +int PMPIX_Mutex_create(int count, MPI_Comm comm, MPIX_Mutex *hdl); +int PMPIX_Mutex_free(MPIX_Mutex *hdl); +int PMPIX_Mutex_lock(MPIX_Mutex hdl, int mutex, int proc); +int PMPIX_Mutex_unlock(MPIX_Mutex hdl, int mutex, int proc); +#endif /* MPI_BUILD_PROFILING */ + +#endif diff --git a/include/mpicxx.h b/include/mpicxx.h new file mode 100644 index 000000000..4f2f84ccf --- /dev/null +++ b/include/mpicxx.h @@ -0,0 +1,2742 @@ +/* -*- Mode: C++; c-basic-offset:4 ; -*- */ +/* + * (C) 2001 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * This file is automatically generated by buildiface -nosep -initfile=cxx.vlist + * DO NOT EDIT + */ +/* style: c++ header */ + +#ifdef MPI +#error "You cannot define MPI; that name is reserved for the MPI namespace" +#endif + +// Check for incompatible GCC versions +// GCC (specifically) g++ changed the calling convention +// between 3.2.3 and 3.4.3 (!!) Normally such changes +// should only occur at major releases (e.g., version 3 to 4) +#ifdef __GNUC__ +# if __GNUC__ >= 4 +# if __GNUC_MINOR__ > 2 && 8 == 2 +# error 'Please use the same version of GCC and g++ for compiling MPICH and user MPI programs' +# endif +# endif +#endif + +/* + * Because the API is defined, some methods have parameters that are + * not used. 
The following definition allows us to suppress warnings
+ * about unused arguments in methods when -Wall -Wextra are specified.
+ * this definition is removed at the end of this file.
+ */
+#ifdef MPIR_ARGUNUSED
+#error MPIR_ARGUNUSED defined
+#endif
+#if defined(__GNUC__) && __GNUC__ >= 4
+#define MPIR_ARGUNUSED __attribute__((unused))
+#else
+#define MPIR_ARGUNUSED
+#endif
+// There is a name conflict between stdio.h and iostream (or iostream.h)
+// and the MPI C++ binding with respect to the names SEEK_SET, SEEK_CUR,
+// and SEEK_END.  MPI wants these in the MPI namespace, but stdio.h,
+// iostream, or iostream.h will #define these to integer values.
+// #undef'ing these can cause obscure problems.
+#ifndef MPICH_IGNORE_CXX_SEEK
+
+// MPICH_DONT_INCLUDE_STDIO_H is another escape hatch for us, just like
+// MPICH_IGNORE_CXX_SEEK. If we encounter a wacky environment or user in the
+// wild that does not want our workaround and/or the stdio.h header, then we can
+// offer them a way out.
+#ifndef MPICH_DONT_INCLUDE_STDIO_H
+// ensure that we have SEEK_* defined
+# include <stdio.h>
+#endif
+
+enum MPIR_Dummy_seek_type {
+    MPIR_DUMMY_SEEK_COMMA_VAL = -1  // permits cleaner comma logic
+#ifdef SEEK_SET
+    , MPIR_SEEK_SET = SEEK_SET
+# undef SEEK_SET
+    , SEEK_SET = MPIR_SEEK_SET
+#endif
+#ifdef SEEK_CUR
+    , MPIR_SEEK_CUR = SEEK_CUR
+# undef SEEK_CUR
+    , SEEK_CUR = MPIR_SEEK_CUR
+#endif
+#ifdef SEEK_END
+    , MPIR_SEEK_END = SEEK_END
+# undef SEEK_END
+    , SEEK_END = MPIR_SEEK_END
+#endif
+#ifdef LOCK_SHARED
+    , MPIR_LOCK_SHARED = LOCK_SHARED
+# undef LOCK_SHARED
+    , LOCK_SHARED = MPIR_LOCK_SHARED
+#endif
+};
+
+#endif // MPICH_IGNORE_CXX_SEEK
+namespace MPI {
+#if 1
+#define MPIX_CALLREF( _objptr, fnc ) \
+    { int err = fnc; if (err) { (_objptr)->Call_errhandler( err ); }}
+#define MPIX_CALLOBJ( _obj, fnc ) \
+    { int err = fnc; if (err) { (_obj).Call_errhandler( err ); }}
+#define MPIX_CALLWORLD( fnc ) \
+    { int err = fnc ; if (err) MPIR_Call_world_errhand( err ); }
+extern void MPIR_Call_world_errhand( int );
+#else
+#define MPIX_CALLREF( _objptr, fnc ) (void)fnc
+#define MPIX_CALLOBJ( _obj, fnc ) (void)fnc
+#define MPIX_CALLWORLD( fnc ) (void)fnc
+#endif
+
+// Typedefs for basic int types
+typedef MPI_Offset Offset;
+typedef MPI_Aint Aint;
+typedef MPI_Fint Fint;
+
+// Special internal routine
+void MPIR_CXX_InitDatatypeNames( void );
+
+// Forward class declarations
+class Comm;
+class Nullcomm;
+class Intercomm;
+class Intracomm;
+class Cartcomm;
+class Graphcomm;
+class File;
+
+// base (classless) routines
+extern int Detach_buffer( void *&v1 ) ;
+extern bool Is_initialized( void ) ;
+extern void Get_processor_name( char * v1, int &v2 ) ;
+extern void Get_error_string( int v1, char * v2, int &v3 ) ;
+extern void Compute_dims( int v1, int v2, int v3[] ) ;
+extern void Get_version( int &v1, int &v2 ) ;
+extern void Finalize( void ) ;
+extern void Pcontrol( const int v1, ... 
) ; +extern void Attach_buffer( void * v1, int v2 ) ; +extern int Get_error_class( int v1 ) ; +extern Intracomm COMM_WORLD; +extern File FILE_NULL; + +class Exception { + + protected: + int the_real_exception; + + public: + // new/delete + + inline Exception(int obj) : the_real_exception(obj) {} + inline Exception(void) : the_real_exception(0) {} + + virtual ~Exception() {} + // copy/assignment + + Exception(const Exception &obj) : the_real_exception(obj.the_real_exception){} + + Exception& operator=(const Exception &obj) { + the_real_exception = obj.the_real_exception; return *this; } + + // logical + bool operator== (const Exception &obj) { + return (the_real_exception == obj.the_real_exception); } + bool operator!= (const Exception &obj) { + return (the_real_exception != obj.the_real_exception); } + // C/C++ cast and assignment + inline operator int*() { return &the_real_exception; } + inline operator int() const { return the_real_exception; } + Exception& operator=(const int& obj) { + the_real_exception = obj; return *this; } + + protected: + char the_error_message[MPI_MAX_ERROR_STRING]; + public: + int Get_error_code(void) { return the_real_exception; } + int Get_error_class(void) { return MPI::Get_error_class(the_real_exception); } + const char *Get_error_string(void) + { + int len; + MPI_Error_string(the_real_exception, the_error_message, &len); + return the_error_message; + } +}; + +class Datatype { + friend class Comm; + friend class Status; + friend class Intracomm; + friend class Intercomm; + friend class Win; + friend class File; + friend class Op; + + protected: + MPI_Datatype the_real_datatype; + + public: + // new/delete + + inline Datatype(MPI_Datatype obj) : the_real_datatype(obj) {} + inline Datatype(void) : the_real_datatype(MPI_DATATYPE_NULL) {} + + virtual ~Datatype() {} + // copy/assignment + + Datatype(const Datatype &obj) : the_real_datatype(obj.the_real_datatype){} + + Datatype& operator=(const Datatype &obj) { + the_real_datatype = obj.the_real_datatype; return *this; } + + // logical + bool operator== (const Datatype &obj) { + return (the_real_datatype == obj.the_real_datatype); } + bool operator!= (const Datatype &obj) { + return (the_real_datatype != obj.the_real_datatype); } + // C/C++ cast and assignment + inline operator MPI_Datatype*() { return &the_real_datatype; } + inline operator MPI_Datatype() const { return the_real_datatype; } + Datatype& operator=(const MPI_Datatype& obj) { + the_real_datatype = obj; return *this; } + virtual void Commit( void ) + { + MPIX_CALLWORLD( MPI_Type_commit( (MPI_Datatype *) &the_real_datatype )); + } + virtual void Free( void ) + { + MPIX_CALLWORLD( MPI_Type_free( (MPI_Datatype *) &the_real_datatype )); + } + virtual Datatype Create_indexed( int v1, const int * v2, const int * v3 ) const + { + Datatype v5; + MPIX_CALLWORLD( MPI_Type_indexed( v1, (const int *)v2, (const int *)v3, (MPI_Datatype) the_real_datatype, &(v5.the_real_datatype) )); + return v5; + } + virtual Datatype Create_contiguous( int v1 ) const + { + Datatype v3; + MPIX_CALLWORLD( MPI_Type_contiguous( v1, (MPI_Datatype) the_real_datatype, &(v3.the_real_datatype) )); + return v3; + } + virtual Datatype Create_vector( int v1, int v2, int v3 ) const + { + Datatype v5; + MPIX_CALLWORLD( MPI_Type_vector( v1, v2, v3, (MPI_Datatype) the_real_datatype, &(v5.the_real_datatype) )); + return v5; + } + static Datatype Create_struct( int v1, int v2[], Aint v3[], const Datatype v4[] ) + { + Datatype v5; + MPI_Datatype *l4 = new MPI_Datatype[v1]; + { + int i4; + for 
(i4=0;i4/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "Warning: cannot find $dep" + return 1 + fi + return 0 +} + + +sudo_passwd="" + +function is_sudoer +{ + echo $sudo_passwd | sudo -S -E -v >/dev/null + if [ $? -eq 1 ]; then + echo "User $(whoami) is not sudoer, and cannot install dependencies." + exit 1 + fi +} + +# centos: yum; ubuntu: apt-get os="centos" +install_command="" +check_os +if [ "$os" == "centos" ]; then + install_command="yum" + check_dependency $install_command + if [ $? -ne 0 ]; then + echo "Please check if CentOS and $install_command is installed correctly." + exit 1 + fi +fi + +package_installer="$install_command -y " username=`whoami` if [ "$username" != "root" ]; then - package_installer="sudo -E" + read -s -p "Enter password for $username: " sudo_passwd + is_sudoer + package_installer="echo $sudo_passwd | sudo -S -E $install_command -y " fi -# centos: yum; ubuntu: apt-get -package_installer+=" yum -y" + function install_deps { echo "Install dependencies..." if [ "$os" == "centos" ]; then - $package_installer clean all - $package_installer upgrade - $package_installer install epel-release - $package_installer groupinstall "Development Tools" + eval $package_installer clean all + eval $package_installer upgrade + eval $package_installer install epel-release + eval $package_installer groupinstall "Development Tools" fi - - $package_installer install python-devel boost boost-devel cmake numpy \ + + eval $package_installer install python-devel boost boost-devel cmake numpy \ numpy-devel gflags gflags-devel glog glog-devel protobuf protobuf-devel hdf5 \ hdf5-devel lmdb lmdb-devel leveldb leveldb-devel snappy-devel opencv \ opencv-devel wget bc numactl } -function check_os +function install_deps_multinode { - echo "Check OS and the version..." -} + host_file=$1 + host_list=(`cat $host_file | sort | uniq`) + host_cnt=${#host_list[@]} + if [ $host_cnt -eq 0 ]; then + echo "Error: empty host list. Exit." + exit 1 + fi -function checkout_source -{ - echo "Checkout source code of Intel Caffe..." - git clone https://github.com/intel/caffe.git - if [ $? -eq 128 ]; then - echo "Error during checking out source code. 
Please set proxy as below:" - echo " export https_proxy=https://username:password@proxy.com:port" + echo "Make sure you're executing command on host ${host_list[0]}" + + echo $sudo_passwd | sudo -S -E yum -y clean all + + if [ "$os" == "centos" ]; then + eval $package_installer upgrade + eval $package_installer install epel-release + eval $package_installer clean all + eval $package_installer groupinstall "Development Tools" fi + + eval $package_installer install ansible + + tmp_host_file=ansible_hosts.tmp + ansible_host_file=/etc/ansible/hosts + echo -e "[ourmaster]\n${host_list[0]}\n[ourcluster]\n" >$tmp_host_file + for ((i=1; i<${#host_list[@]}; i++)) + do + echo -e "${host_list[$i]}\n" >>$tmp_host_file + done + $command_prefix mv -f $tmp_host_file $ansible_host_file + + ssh-keygen -t rsa -q + for host in ${host_list[@]} + do + ssh-copy-id -i ~/.ssh/id_rsa.pub $host + done + ansible ourcluster -m ping + + ansible all -m shell -a "$package_installer install python-devel boost boost-devel cmake numpy numpy-devel gflags gflags-devel glog glog-devel protobuf protobuf-devel hdf5 hdf5-devel lmdb lmdb-devel leveldb leveldb-devel snappy-devel opencv opencv-devel" + + ansible all -m shell -a "$package_installer install mc cpuinfo htop tmux screen iftop iperf vim wget bc numactl" + ansible all -m shell -a "systemctl stop firewalld.service" } + function build_caffe { + is_multinode_=$1 + echo "Build Intel Caffe..." cp Makefile.config.example Makefile.config + + if [ $is_multinode_ -eq 1 ]; then + echo "USE_MLSL := 1" >> Makefile.config + echo "CAFFE_PER_LAYER_TIMINGS := 1" >> Makefile.config + + mlslvars_sh=`find external/mlsl/ -name mlslvars.sh` + if [ -f $mlslvars_sh ]; then + source $mlslvars_sh + fi + fi + make -j 8 } -function is_sudoer +function sync_caffe_dir { - sudo -v >/dev/null - if [ $? -eq 1 ]; then - echo "User $(whoami) is not sudoer, and cannot install dependencies." - return 1 - fi - return 0 + caffe_dir=`pwd` + caffe_parent_dir=`dirname $caffe_dir` + ansible ourcluster -m synchronize -a "src=$caffe_dir dest=$caffe_parent_dir" } -check_os -if [ "$os" == "ubuntu" ]; then - package_installer="apt-get" -fi -is_sudoer +host_file="" +while [[ $# -gt 1 ]] +do + key="$1" + case $key in + --host) + host_file="$2" + shift + ;; + --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $key" + usage + exit 1 + ;; + esac + shift +done + + if [ $? -eq 0 ]; then - install_deps + if [ "$host_file" == "" ]; then + install_deps + else + install_deps_multinode $host_file + fi fi +is_multinode=0 +if [ "$host_file" != "" ]; then + is_multinode=1 +fi +build_caffe $is_multinode -build_caffe +if [ $is_multinode -eq 1 ]; then + sync_caffe_dir +fi echo "Done." 
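Before the run_benchmark.sh diff below, it helps to spell out the throughput arithmetic that the script derives from the Caffe logs: images/sec is the batch size (scaled by the node count in multinode runs) divided by the average forward-backward time per iteration. The following is a minimal C++ sketch of that calculation only; `images_per_second`, its parameters, and the sample numbers are illustrative stand-ins for the values the script extracts with grep/awk, not part of the patch.

```
#include <iostream>

// Hypothetical helper mirroring run_benchmark.sh's images/sec math:
// speed = batch_size [* numnodes] * 1000 / average fwd-bwd time in ms.
static long images_per_second(long batch_size, int numnodes,
                              double average_time_ms) {
  // Single node: one process's minibatch per iteration; multinode: every
  // node runs its own minibatch per iteration, so throughput scales with
  // the node count (this mirrors the script's two bc expressions).
  long effective_batch = batch_size * (numnodes > 1 ? numnodes : 1);
  return static_cast<long>(effective_batch * 1000.0 / average_time_ms);
}

int main() {
  // Example: batch 64 per node, 4 nodes, 250 ms per iteration -> 1024.
  std::cout << images_per_second(64, 4, 250.0) << " images/sec\n";
  return 0;
}
```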
diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
index 63cccc8b5..a4911a65c 100755
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 
 # model path
 model_path="models/intel_optimized_models"
@@ -12,27 +12,43 @@ declare -a model_list=("alexnet" "googlenet" "googlenet_v2" "resnet_50")
 # it's assigned by detect_cpu
 cpu_model="skx"
 
+# flag used to mark whether we have detected which cpu model we're using
+unknown_cpu=0
+
 # specify default engine for running caffe benchmarks
-engine="MKL2017"
+engine="MKLDNN"
+
+# single node is supported by default
+numnodes=1
 
-# directory path to save results
-result_dir=""
+# intelcaffe_log_file holds the output of 'run_intelcaffe'
+intelcaffe_log_file=""
 
 # specific script used to run intelcaffe
 caffe_bin="./scripts/run_intelcaffe.sh"
 
-# Iterations to run benchmark
-iterations=5
+# iterations to run benchmark
+iterations=100
+
+# host file needed to run the multinode benchmark
+host_file=""
+
+# network parameters
+network="opa"
+tcp_netmask=""
 
 function usage
 {
     script_name=$0
     echo "Usage:"
-    echo "  $script_name --topology network_topology"
+    echo "  $script_name --topology network_topology [--host_file host_file] [--network opa/tcp] [--netmask tcp_netmask]"
     echo ""
     echo "  Parameters:"
     echo "    topology: network topology used to benchmark, support alexnet, googlenet, googlenet_v2, resnet_50"
     echo "              , by specifying it as 'all', we run all supported topologies."
+    echo "    host_file: host file needed in multinode mode, should contain the list of node IPs or hostnames"
+    echo "    network: opa(default), tcp, used in multinode mode to specify the network type"
+    echo "    netmask: only used if network is tcp, set to the network card name within your network"
     echo ""
 }
@@ -48,9 +64,21 @@ function is_supported_topology
     fi
 }
 
+function calculate_numnodes
+{
+    if [[ $host_file != "" ]]; then
+        host_list=(`cat $host_file | sort | uniq`)
+        numnodes=${#host_list[@]}
+        if [ $numnodes -eq 0 ]; then
+            echo "Error: empty host list. Exit."
+            exit 1
+        fi
+    fi
+    echo "Number of nodes: $numnodes"
+}
+
 function detect_cpu
 {
-    # detect cpu model
     model_string=`lscpu | grep "Model name" | awk -F ':' '{print $2}'`
     if [[ $model_string == *"72"* ]]; then
         cpu_model="knl"
@@ -61,23 +89,119 @@ function detect_cpu
     elif [[ $model_string == *"E5-26"* ]]; then
         cpu_model="bdw"
     else
-        echo "Will use default settings, which may not be the optimal one."
+        unknown_cpu=1
+        echo "Can't detect which cpu model is in use; will use default settings, which may not be the optimal one."
     fi
 }
 
 function run_specific_model
 {
-    model_file="models/intel_optimized_models/${model}/${cpu_model}/train_val_dummydata.prototxt"
-    exec_command="${caffe_bin} --model_file ${model_file} --mode time --iteration ${iterations} --benchmark none"
-    $exec_command
+    if [ $numnodes -eq 1 ]; then
+        model_file="models/intel_optimized_models/${model}/${cpu_model}/train_val_dummydata.prototxt"
+        exec_command="${caffe_bin} --model_file ${model_file} --mode time --iteration ${iterations} --benchmark none"
+    else
+        solver_file="models/intel_optimized_models/${model}/${cpu_model}/solver_dummydata.prototxt"
+        exec_command="${caffe_bin} --host $host_file --solver $solver_file --network $network --netmask $tcp_netmask --benchmark none"
+    fi
+
+    # Result file to save the detailed run_intelcaffe output
+    if [ $unknown_cpu -eq 0 ]; then
+        result_log_file="result-${cpu_model}-${model}-`date +%Y%m%d%H%M%S`.log"
+    else
+        result_log_file="result-unknown-${model}-`date +%Y%m%d%H%M%S`.log"
+    fi
+    $exec_command > $result_log_file 2>&1
+    obtain_intelcaffe_log $result_log_file
+    calculate_images_per_second $intelcaffe_log_file
+}
+
+function obtain_intelcaffe_log
+{
+    echo "Result_log_file : $1"
+    if [ -f $1 ]; then
+        result_dir_line=`cat $1 | grep "Result folder:"`
+        if [[ $result_dir_line = "" ]]; then
+            echo "Couldn't find result folder within file $1"
+            exit 1
+        fi
+        result_dir=`echo $result_dir_line | awk -F ' ' '{print $(NF)}'`
+        if [ $unknown_cpu -eq 0 ]; then
+            caffe_log_file="outputCluster-${cpu_model}-${numnodes}.txt"
+        else
+            caffe_log_file="outputCluster-unknown-${numnodes}.txt"
+        fi
+        intelcaffe_log_file="${result_dir}/${caffe_log_file}"
+    else
+        echo "Couldn't find result log file $result_log_file"
+        exit 1
+    fi
+}
+
+function obtain_average_fwd_bwd_time
+{
+    result_file=$1
+    if [ ! -f $result_file ]; then
+        echo "Error: result file $result_file does not exist..."
+        exit 1
+    fi
+
+    if [ $numnodes -eq 1 ]; then
+        average_time_line=`cat $result_file | grep "Average Forward-Backward"`
+        average_time=`echo $average_time_line | awk -F ' ' '{print $(NF-1)}'`
+    else
+        start_iteration=1000
+        iteration_num=100
+        total_time=0
+        deltaTimeList=`cat $result_file | grep "DELTA TIME" | tail -n "+${start_iteration}" | head -n ${iteration_num} | awk '{print $(NF-1)}'`
+
+        for delta_time in ${deltaTimeList}
+        do
+            iteration_time=`echo "$delta_time" | bc`
+            total_time=`echo "$total_time+$iteration_time" | bc`
+        done
+
+        average_time=`echo "$total_time*1000/$iteration_num" | bc`
+    fi
+    echo "average time: ${average_time}"
+}
+
+function obtain_batch_size
+{
+    log_file=$1
+    if [ ! -f $log_file ]; then
+        echo "Error: log file $log_file does not exist..."
+ exit 1 + fi + if [ $numnodes -eq 1 ]; then + batch_size=`cat $log_file | grep shape | sed -n "3, 1p" | awk '{print $(NF-4)}'` + else + batch_size=`cat $log_file | grep SetMinibatchSize | sed -n "1, 1p" | awk '{print $(NF)}'` + fi + echo "batch size: $batch_size" } +function calculate_images_per_second +{ + obtain_batch_size $1 + obtain_average_fwd_bwd_time $1 + if [ $numnodes -eq 1 ]; then + speed=`echo "$batch_size*1000/$average_time" | bc` + else + speed=`echo "$batch_size*$numnodes*1000/$average_time" | bc` + fi + echo "benchmark speed : $speed images/sec" +} + + function run_benchmark { + detect_cpu + calculate_numnodes echo "Cpu model : $model_string" if [[ $topology = "all" ]]; then for ((i=0; i<${#model_list[@]}; i++)) do + echo "--${model_list[$i]}" model=${model_list[$i]} run_specific_model done @@ -87,12 +211,29 @@ function run_benchmark fi } +function check_parameters +{ + if [[ $topology = "" ]]; then + echo "Error: topology is not specified." + usage + exit 1 + fi + + if [[ $host_file != "" ]]; then + if [ "$network" = "tcp" -a "$tcp_netmask" = "" ]; then + echo "Error: need to specify tcp network's netmask" + usage + exit 1 + fi + fi + is_supported_topology +} + if [[ $# -le 1 ]]; then usage exit 0 fi -root_dir=$(cd $(dirname $(dirname $0)); pwd) while [[ $# -gt 1 ]] do key="$1" @@ -101,6 +242,18 @@ do topology="$2" shift ;; + --host_file) + host_file="$2" + shift + ;; + --network) + network="$2" + shift + ;; + --netmask) + tcp_netmask="$2" + shift + ;; *) echo "Unknown option: $key" usage @@ -110,16 +263,6 @@ do shift done -# check parameters -if [[ $topology = "" ]]; then - echo "Error: topology is not specified." - exit 1 -fi - -# check if input topology is supported -is_supported_topology - -detect_cpu +check_parameters -# start running benchmark -run_benchmark +run_benchmark diff --git a/scripts/run_intelcaffe.sh b/scripts/run_intelcaffe.sh index b7e6c3429..6440f916c 100755 --- a/scripts/run_intelcaffe.sh +++ b/scripts/run_intelcaffe.sh @@ -1,6 +1,10 @@ -#!/bin/sh +#!/bin/bash + +benchmark_mode="none" + +# by default, run intel caffe on single node +numnodes=1 -benchmark_mode="all" # time/train/resume_train mode="train" @@ -27,7 +31,7 @@ snapshot="" solver_file="" # specify engine for running caffe -engine="MKL2017" +engine="MKLDNN" #default numa node if needed numanode=0 @@ -52,16 +56,14 @@ function usage echo " [--mpibench_param mpibench_param]" echo "" echo " Parameters:" - echo " mode: train(default), resume_train, time, none(not to run caffe test)" - echo "" - echo " Optional parameters:" echo " host: host file includes list of nodes. Only used when you're running multinodes mode" echo " solver: need to be specified a solver file if mode is train/resume_train" echo " network: opa(default), tcp" echo " netmask: only used if network is tcp" echo " debug: off(default). MLSL debug information is outputed if it's on" - echo " benchmark: all(default). Includes qperf, all-reduce performance" - echo " Dependency: user needs to install qperf, IMB-MPI1;" + echo " mode: train(default), resume_train, time, none(not to run caffe test)" + echo " benchmark: none(disabled by default). Includes qperf, all-reduce performance" + echo " Dependency: user needs to install qperf, Intel MPI library (including IMB-MPI1);" echo " and add them in system path." echo " iteration and model_file: only used if mode is time (caffe time)" echo " snapshot: only used if mode is resume_train" @@ -71,7 +73,7 @@ function usage echo " mpibench_param: allreduce (default). parameter of mpi benchmark." 
} -declare -a cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knight Landing)" +declare -a cpu_list=("Intel Xeon E5-26xx (Broadwell)" "Intel Xeon Phi 72xx (Knights Landing)" "Intel Xeon Platinum 8180 (Skylake)" "Intel Xeon 6148 (Skylake)") function detect_cpu @@ -89,19 +91,17 @@ function detect_cpu else cpu_model="unknown" echo "CPU model :$model_string is unknown." - echo "Will use default settings, which may not be the optimal one." + echo " Use default settings, which may not be the optimal one." fi } function set_numa_node { - numa_bin="numactl -H" - # check if os has 'libnuma' package ready - check_dependency $numa_bin + check_dependency numactl if [ $? -ne 0 ]; then - echo "No numactl command support." - return - fi + return + fi + # detect numa mode: cache and flat mode for KNL numa_node=($(numactl -H | grep "available" | awk -F ' ' '{print $2}')) if [ $numa_node -eq 1 ]; then @@ -129,6 +129,10 @@ function check_dependency function init_mpi_envs { + if [ ${numnodes} -eq 1 ]; then + return + fi + # IMPI configuration if [ "$network" == "opa" ]; then export I_MPI_FABRICS=tmi @@ -197,7 +201,7 @@ function clear_envs function set_mlsl_vars { - if [ "${num_mlsl_servers}" -eq -1 ]; then + if [ ${num_mlsl_servers} -eq -1 ]; then if [ ${numnodes} -eq 1 ]; then numservers=0 else @@ -261,24 +265,18 @@ function execute_command local xeonbin_=$1 local result_dir_=$2 - if [ "${cpu_model}" == "bdw" ] || [ "${cpu_model}" == "skx" ]; then + if [ "${cpu_model}" == "bdw" ]; then exec_command="$xeonbin_" + elif [ "${cpu_model}" == "skx" ]; then + exec_command="numactl -l $xeonbin_" else - numa_bin="numactl --preferred=$numanode" - # check if os has 'libnuma' package ready - check_dependency $numa_bin - if [ $? -ne 0 ]; then - echo "No numactl command support." - exec_command="$xeonbin_" - else - exec_command="$numa_bin $xeonbin_" - fi + exec_command="numactl --preferred=$numanode $xeonbin_" fi if [ ${numnodes} -gt 1 ]; then # Produce the configuration file for mpiexec. # Each line of the config file contains a # host, environment, binary name. - cfile_=nodeconfig-${cpu_model}-${numnodes}.txt + cfile_=$result_dir_/nodeconfig-${cpu_model}-${numnodes}.txt rm -f $cfile_ for node in "${nodenames[@]}" @@ -286,7 +284,7 @@ function execute_command echo "-host ${node} -n $ppncpu $exec_command" >> $cfile_ done fi - log_file=outputCluster-${cpu_model}-${numnodes}.txt + log_file=$result_dir_/outputCluster-${cpu_model}-${numnodes}.txt clear_envs @@ -294,54 +292,21 @@ function execute_command check_dependency $sensors_bin has_sensors=$? 
if [ $has_sensors -eq 0 ]; then - sensor_log_file=sensors-${cpu_model}-${numnodes}-start.log + sensor_log_file=$result_dir_/sensors-${cpu_model}-${numnodes}-start.log $sensors_bin >$sensor_log_file - mv $sensor_log_file $result_dir_/ fi if [ ${numnodes} -eq 1 ]; then - time GLOG_minloglevel=0 $exec_command >${log_file} 2>&1 + time GLOG_minloglevel=0 $exec_command 2>&1 | tee ${log_file} else exec_command="-l -configfile $cfile_" - time GLOG_minloglevel=0 mpiexec.hydra $exec_command >${log_file} 2>&1 + time GLOG_minloglevel=0 mpiexec.hydra $exec_command 2>&1 | tee ${log_file} fi if [ $has_sensors -eq 0 ]; then - sensor_log_file=sensors-${cpu_model}-${numnodes}-end.log + sensor_log_file=$result_dir_/sensors-${cpu_model}-${numnodes}-end.log $sensors_bin >$sensor_log_file - mv $sensor_log_file $result_dir_/ fi - mv $log_file $cfile_ $result_dir_/ -} - -# used to calculate images / s -function obtain_average_fwd_bwd_time -{ - result_file="${result_dir}/${log_file}" - if [ -f $result_file ]; then - average_time_line=`cat $result_file | grep "Average Forward-Backward"` - average_time=`echo $average_time_line | awk -F ' ' '{print $(NF-1)}'` - echo "average time : ${average_time} ms" - else - echo "Error: result file $result_file does not exist..." - exit 1 - fi -} - -# used to calculate images / s -function obtain_batch_size -{ - # catch batch size of training - batch_size=`cat $model_file | grep shape | sed -n "1, 1p" | awk '{print $4}'` - echo "batch size : $batch_size" -} - -function calculate_images_per_second -{ - obtain_batch_size - obtain_average_fwd_bwd_time - speed=`echo "$batch_size*1000/$average_time" | bc` - echo "benchmark speed : $speed images/s" } function run_qperf_bench @@ -398,8 +363,13 @@ function run_mpi_bench mpibench_bin_bname=`basename $mpibench_bin` - declare -a adjust_values=(1 2 3 5 7 8 9 0) - declare -a collective_values=('tmi' 'none') + if [ "${benchmark_mode}" == "all" ]; then + declare -a adjust_values=(1 2 3 5 7 8 9 0) + declare -a collective_values=('tmi' 'none') + else + declare -a adjust_values=(0) + declare -a collective_values=('none') + fi echo "Start mpi bench..." for ((i=0; i<${#adjust_values[@]}; i++)) @@ -442,7 +412,7 @@ function run_benchmark run_qperf_bench fi - if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode == mpi" ]; then + if [ "$benchmark_mode" == "all" ] || [ "$benchmark_mode" == "mpi" ]; then set_env_vars run_mpi_bench fi @@ -451,9 +421,7 @@ function run_benchmark function run_caffe { - if [[ $host_file != "" ]]; then - echo "Run caffe with ${numnodes} nodes..." - fi + echo "Run caffe with ${numnodes} nodes..." if [ ${mode} == "time" ]; then xeonbin="$caffe_bin time --iterations $iteration --model $model_file -engine=$engine" @@ -464,13 +432,8 @@ function run_caffe fi fi - if [[ $host_file != "" ]]; then - set_env_vars - fi + set_env_vars execute_command "$xeonbin" $result_dir - if [ ${mode} == "time" ]; then - calculate_images_per_second - fi } @@ -590,7 +553,7 @@ if [ "$mode" == "train" ] || [ "$mode" == "resume_train" ]; then exit 1 fi if [ ! -f $snapshot ]; then - echo "Eror: snapshot file does NOT exist." + echo "Error: snapshot file does NOT exist." exit 1 fi echo " Snapshot for resuming train: $snapshot" @@ -603,7 +566,7 @@ if [ "$mode" == "time" ]; then exit 1 fi if [ ! -f $model_file ]; then - echo "Eror: model file does NOT exist." + echo "Error: model file does NOT exist." 
exit 1 fi @@ -633,19 +596,15 @@ if [[ $host_file != "" ]]; then exit 0 fi numnodes=${#nodenames[@]} -else - numnodes=1 fi echo " Number of nodes: $numnodes" detect_cpu -if [ "$cpu_model" == "knl" ]; then - set_numa_node -fi +set_numa_node if [ ! -d $result_dir ]; then - #echo "Create result directory: $result_dir" + echo "Create result directory: $result_dir" mkdir -p $result_dir fi diff --git a/src/caffe/data_reader.cpp b/src/caffe/data_reader.cpp index 69e8f20f5..4c9ea32f0 100644 --- a/src/caffe/data_reader.cpp +++ b/src/caffe/data_reader.cpp @@ -49,7 +49,6 @@ namespace caffe { using boost::weak_ptr; -map > DataReader::bodies_; static boost::mutex bodies_mutex_; DataReader::DataReader(const LayerParameter& param) diff --git a/src/caffe/layer.cpp b/src/caffe/layer.cpp index ad6422694..9db843056 100644 --- a/src/caffe/layer.cpp +++ b/src/caffe/layer.cpp @@ -72,7 +72,7 @@ mn::Distribution & Layer::GetDistribution() { template bool Layer::Bypass(const vector*>& bottom, const vector*>& top) { - return GetDistribution().get_group_id() > 0; + return GetDistribution().get_global_part_id() > 0; } template @@ -99,7 +99,9 @@ void Layer::MultinodeSetUp(const vector*>& bottom, CHECK_GT(shape.size(), 0); oc = shape[0]; if (shape.size() > 1) ic = shape[1]; - if (shape.size() >= 4) hw = shape[2] * shape[3]; + for (int k = 2; k < shape.size(); k++) { + hw *= shape[k]; + } // Note that MLSL expects the entire weights from a model group. // So we should multiply by model_parts here. reg_info.add_parameter_set(ic * oc * model_parts, hw); diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 0a6f83a21..fa37e24ec 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -55,6 +55,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caffe/layers/sigmoid_layer.hpp" #include "caffe/layers/softmax_layer.hpp" #include "caffe/layers/tanh_layer.hpp" +#include "caffe/layers/deconv_layer.hpp" #ifdef MKL2017_SUPPORTED #include "caffe/layers/mkl_layers.hpp" #endif diff --git a/src/caffe/layers/box_annotator_ohem_layer.cpp b/src/caffe/layers/box_annotator_ohem_layer.cpp new file mode 100644 index 000000000..2b89e67ee --- /dev/null +++ b/src/caffe/layers/box_annotator_ohem_layer.cpp @@ -0,0 +1,168 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include + +#include +#include +#include + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/layer.hpp" +#include "caffe/layers/box_annotator_ohem_layer.hpp" +#include "caffe/proto/caffe.pb.h" + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace caffe { + + template + void BoxAnnotatorOHEMLayer::LayerSetUp( + const vector*>& bottom, + const vector*>& top) { + BoxAnnotatorOHEMParameter box_anno_param = + this->layer_param_.box_annotator_ohem_param(); + roi_per_img_ = box_anno_param.roi_per_img(); + CHECK_GT(roi_per_img_, 0); + ignore_label_ = box_anno_param.ignore_label(); + } + + template + void BoxAnnotatorOHEMLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + num_ = bottom[0]->num(); + CHECK_EQ(5, bottom[0]->channels()); + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + spatial_dim_ = height_*width_; + + CHECK_EQ(bottom[1]->num(), num_); + CHECK_EQ(bottom[1]->channels(), 1); + CHECK_EQ(bottom[1]->height(), height_); + CHECK_EQ(bottom[1]->width(), width_); + + CHECK_EQ(bottom[2]->num(), num_); + CHECK_EQ(bottom[2]->channels(), 1); + CHECK_EQ(bottom[2]->height(), height_); + CHECK_EQ(bottom[2]->width(), width_); + + CHECK_EQ(bottom[3]->num(), num_); + bbox_channels_ = bottom[3]->channels(); + CHECK_EQ(bottom[3]->height(), height_); + CHECK_EQ(bottom[3]->width(), width_); + + // Labels for scoring + top[0]->Reshape(num_, 1, height_, width_); + // Loss weights for bbox regression + top[1]->Reshape(num_, bbox_channels_, height_, width_); + } + + template + void BoxAnnotatorOHEMLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_rois = bottom[0]->cpu_data(); + const Dtype* bottom_loss = bottom[1]->cpu_data(); + const Dtype* bottom_labels = bottom[2]->cpu_data(); + const Dtype* bottom_bbox_loss_weights = bottom[3]->cpu_data(); + Dtype* top_labels = top[0]->mutable_cpu_data(); + Dtype* top_bbox_loss_weights = top[1]->mutable_cpu_data(); + caffe_set(top[0]->count(), Dtype(ignore_label_), top_labels); + caffe_set(top[1]->count(), Dtype(0), top_bbox_loss_weights); + + int num_rois_ = bottom[1]->count(); + + int num_imgs = -1; + for (int n = 0; n < num_rois_; n++) { + for (int s = 0; s < spatial_dim_; s++) { + num_imgs = bottom_rois[0] > num_imgs ? 
bottom_rois[0] : num_imgs; + bottom_rois++; + } + bottom_rois += (5 - 1) * spatial_dim_; + } + num_imgs++; + CHECK_GT(num_imgs, 0) + << "number of images must be greater than 0 at BoxAnnotatorOHEMLayer"; + bottom_rois = bottom[0]->cpu_data(); + + // Find rois with max loss + vector sorted_idx(num_rois_); + for (int i = 0; i < num_rois_; i++) { + sorted_idx[i] = i; + } + std::sort(sorted_idx.begin(), sorted_idx.end(), + [bottom_loss](int i1, int i2) { + return bottom_loss[i1] > bottom_loss[i2]; + }); + + // Generate output labels for scoring and loss_weights for bbox regression + vector number_left(num_imgs, roi_per_img_); + for (int i = 0; i < num_rois_; i++) { + int index = sorted_idx[i]; + int s = index % (width_ * height_); + int n = index / (width_ * height_); + int batch_ind = bottom_rois[n * 5 * spatial_dim_ + s]; + if (number_left[batch_ind] > 0) { + number_left[batch_ind]--; + top_labels[index] = bottom_labels[index]; + for (int j = 0; j < bbox_channels_; j++) { + int bbox_index = (n * bbox_channels_ + j) * spatial_dim_ + s; + top_bbox_loss_weights[bbox_index] = bottom_bbox_loss_weights[bbox_index]; + } + } + } + } + + template + void BoxAnnotatorOHEMLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + return; + } + + +#ifdef CPU_ONLY + STUB_GPU(BoxAnnotatorOHEMLayer); +#endif + + INSTANTIATE_CLASS(BoxAnnotatorOHEMLayer); + REGISTER_LAYER_CLASS(BoxAnnotatorOHEM); + +} // namespace caffe diff --git a/src/caffe/layers/box_annotator_ohem_layer.cu b/src/caffe/layers/box_annotator_ohem_layer.cu new file mode 100644 index 000000000..5d2b5f974 --- /dev/null +++ b/src/caffe/layers/box_annotator_ohem_layer.cu @@ -0,0 +1,119 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// ------------------------------------------------------------------ +// R-FCN +// Written by Yi Li +// ------------------------------------------------------------------ + +#include +#include +#include + +#include "caffe/layers/box_annotator_ohem_layer.hpp" + +using std::max; +using std::min; + +namespace caffe { + template + void BoxAnnotatorOHEMLayer::Forward_gpu( + const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_rois = bottom[0]->cpu_data(); + const Dtype* bottom_loss = bottom[1]->cpu_data(); + const Dtype* bottom_labels = bottom[2]->cpu_data(); + const Dtype* bottom_bbox_loss_weights = bottom[3]->cpu_data(); + Dtype* top_labels = top[0]->mutable_cpu_data(); + Dtype* top_bbox_loss_weights = top[1]->mutable_cpu_data(); + caffe_set(top[0]->count(), Dtype(ignore_label_), top_labels); + caffe_set(top[1]->count(), Dtype(0), top_bbox_loss_weights); + + int num_rois_ = bottom[1]->count(); + + int num_imgs = -1; + for (int n = 0; n < num_rois_; n++) { + for (int s = 0; s < spatial_dim_; s++) { + num_imgs = bottom_rois[0] > num_imgs ? bottom_rois[0] : num_imgs; + bottom_rois++; + } + bottom_rois += (5 - 1) * spatial_dim_; + } + num_imgs++; + CHECK_GT(num_imgs, 0) + << "number of images must be greater than 0 at BoxAnnotatorOHEMLayer"; + bottom_rois = bottom[0]->cpu_data(); + + // Find rois with max loss + vector sorted_idx(num_rois_); + for (int i = 0; i < num_rois_; i++) { + sorted_idx[i] = i; + } + std::sort(sorted_idx.begin(), sorted_idx.end(), + [bottom_loss](int i1, int i2) { + return bottom_loss[i1] > bottom_loss[i2]; + }); + + // Generate output labels for scoring and loss_weights for bbox regression + vector number_left(num_imgs, roi_per_img_); + for (int i = 0; i < num_rois_; i++) { + int index = sorted_idx[i]; + int s = index % (width_*height_); + int n = index / (width_*height_); + int batch_ind = bottom_rois[n*5*spatial_dim_+s]; + if (number_left[batch_ind] > 0) { + number_left[batch_ind]--; + top_labels[index] = bottom_labels[index]; + for (int j = 0; j < bbox_channels_; j++) { + int bbox_index = (n*bbox_channels_+j)*spatial_dim_+s; + top_bbox_loss_weights[bbox_index] = + bottom_bbox_loss_weights[bbox_index]; + } + } + } + } + + template + void BoxAnnotatorOHEMLayer::Backward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + return; + } + + INSTANTIATE_LAYER_GPU_FUNCS(BoxAnnotatorOHEMLayer); + +} // namespace caffe diff --git a/src/caffe/layers/loss_layer.cpp b/src/caffe/layers/loss_layer.cpp index dfed5b7d0..5881475aa 100644 --- a/src/caffe/layers/loss_layer.cpp +++ b/src/caffe/layers/loss_layer.cpp @@ -54,8 +54,8 @@ void LossLayer::LayerSetUp( template void LossLayer::Reshape( const vector*>& bottom, const vector*>& top) { - CHECK_EQ(bottom[0]->num(), bottom[1]->num()) - << "The data and label should have the same number."; + CHECK_EQ(bottom[0]->shape(0), bottom[1]->shape(0)) + << "The data and label should have the same first dimension."; vector loss_shape(0); // Loss layers output a scalar; 0 axes. 
 top[0]->Reshape(loss_shape);
 }
diff --git a/src/caffe/layers/lrn_layer.cpp b/src/caffe/layers/lrn_layer.cpp
index 75ba4c347..280e5da96 100644
--- a/src/caffe/layers/lrn_layer.cpp
+++ b/src/caffe/layers/lrn_layer.cpp
@@ -78,8 +78,8 @@ void LRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     LayerParameter pool_param;
     pool_param.mutable_pooling_param()->set_pool(
         PoolingParameter_PoolMethod_AVE);
-    pool_param.mutable_pooling_param()->set_pad(pre_pad_);
-    pool_param.mutable_pooling_param()->set_kernel_size(size_);
+    pool_param.mutable_pooling_param()->add_pad(pre_pad_);
+    pool_param.mutable_pooling_param()->add_kernel_size(size_);
     pool_layer_.reset(new PoolingLayer<Dtype>(pool_param));
     pool_layer_->SetUp(square_top_vec_, pool_top_vec_);
     // Set up power_layer_ to compute (1 + alpha_/N^2 s)^-beta_, where s is
diff --git a/src/caffe/layers/mkl_convolution_layer.cpp b/src/caffe/layers/mkl_convolution_layer.cpp
index 59c74102c..ddc9c0231 100644
--- a/src/caffe/layers/mkl_convolution_layer.cpp
+++ b/src/caffe/layers/mkl_convolution_layer.cpp
@@ -97,92 +97,10 @@ void MKLConvolutionLayer<Dtype>::compute_output_shape() {
 }
 
 template <typename Dtype>
-MKLConvolutionLayer<Dtype>::~MKLConvolutionLayer() {
-  dnnDelete<Dtype>(convolutionFwd);
-  dnnDelete<Dtype>(convolutionBwdData);
-  dnnDelete<Dtype>(convolutionBwdFilter);
-  if (this->bias_term_)
-    dnnDelete<Dtype>(convolutionBwdBias);
-}
-
-template <typename Dtype>
-void MKLConvolutionLayer<Dtype>::Init(
-      const vector<Blob<Dtype>*>& bottom,
-      const vector<Blob<Dtype>*>& top) {
-  this->width_ = bottom[0]->width();
-  this->height_ = bottom[0]->height();
-  this->num_ = bottom[0]->num();
-
-  // TODO: clean up this
-  kernel_w_ = this->kernel_shape_.cpu_data()[1];
-  kernel_h_ = this->kernel_shape_.cpu_data()[0];
-  stride_w_ = this->stride_.cpu_data()[1];
-  stride_h_ = this->stride_.cpu_data()[0];
-  pad_w_ = this->pad_.cpu_data()[1];
-  pad_h_ = this->pad_.cpu_data()[0];
-
-  this->bottom_shape_ = &bottom[0]->shape();
-  compute_output_shape();
+void MKLConvolutionLayer<Dtype>::CreateFwdPrimitive() {
   int status;
-  size_t n, g;
-  size_t iw, ih, ic;
-  size_t ow, oh, oc;
-  size_t kw, kh; /* filter */
+  size_t g = std::max(this->group_, 1);
   size_t dimension = 4;
-
-  g = std::max(this->group_, 1);
-  n = this->num_;
-  iw = this->width_;
-  ih = this->height_;
-  ic = this->channels_;
-
-  ow = this->width_out_;
-  oh = this->height_out_;
-  oc = this->num_output_;
-
-  kw = this->kernel_w_;
-  kh = this->kernel_h_;
-
-  size_t bdata_sizes[4] = {iw, ih, ic, n};
-  size_t bdata_strides[4] = {1, iw, iw*ih, iw*ih*ic};
-
-  /* starting with MKL 2017 Gold in case of groups filter layout
-   * becomes 5D, i.e. groups become a separate dimension */
-  size_t g_mkl2017 = g;
-  size_t f_dimension = dimension + (g != 1);
-  if (getMKLBuildDate() < 20160701) {
-    g_mkl2017 = 1;
-    f_dimension = dimension;
-  }
-
-  size_t fdata_sizes[5] = {kw, kh, ic/g, oc/g_mkl2017, g_mkl2017};
-  size_t fdata_strides[5] = {1, kw, kw*kh, kw*kh*ic/g, kw*kh*ic/g*oc/g};
-
-  size_t bias_sizes[1] = {oc};
-  size_t bias_strides[1] = {1};
-
-  size_t tdata_sizes[4] = {ow, oh, oc, n};
-  size_t tdata_strides[4] = {1, ow, ow*oh, ow*oh*oc};
-
-  size_t convolutionStrides[2] = {this->stride_w_, this->stride_h_};
-  int inputOffset[2] = {-this->pad_w_, -this->pad_h_};
-
-  // Names are for debugging purposes only.
-  fwd_bottom_data ->name = "fwd_bottom_data @ " + this->layer_param_.name();
-  fwd_top_data    ->name = "fwd_top_data @ " + this->layer_param_.name();
-  fwd_filter_data ->name = "fwd_filter_data @ " + this->layer_param_.name();
-  fwd_bias_data   ->name = "fwd_bias_data @ " + this->layer_param_.name();
-  bwdd_top_diff   ->name = "bwdd_top_diff @ " + this->layer_param_.name();
-  bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->layer_param_.name();
-  bwdd_filter_data->name = "bwdd_filter_data @ " + this->layer_param_.name();
-  bwdf_top_diff   ->name = "bwdf_top_diff @ " + this->layer_param_.name();
-  bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->layer_param_.name();
-  bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->layer_param_.name();
-  bwdf2fwd_filter_diff->name =
-    "bwdf2fwd_filter_diff @ " + this->layer_param_.name();
-  bwdb_top_diff   ->name = "bwdb_top_diff @ " + this->layer_param_.name();
-  bwdb_bias_diff  ->name = "bwdb_bias_diff @ " + this->layer_param_.name();
-
   // Free MKL primitives
   dnnDelete<Dtype>(convolutionFwd);
   if (this->bias_term_) {
@@ -227,6 +145,15 @@ void MKLConvolutionLayer<Dtype>::Init(
   if (this->bias_term_)
     fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1,
                                   bias_sizes, bias_strides);
+
+}
+
+template <typename Dtype>
+void MKLConvolutionLayer<Dtype>::CreateBwdDataPrimitive() {
+  int status;
+  size_t g = std::max(this->group_, 1);
+  size_t dimension = 4;
+
   /*
    * Backward by data layer setup
    */
@@ -254,6 +181,14 @@ void MKLConvolutionLayer<Dtype>::Init(
   bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter,
                                    f_dimension, fdata_sizes, fdata_strides);
+}
+
+template <typename Dtype>
+void MKLConvolutionLayer<Dtype>::CreateBwdFilterPrimitive() {
+  int status;
+  size_t g = std::max(this->group_, 1);
+  size_t dimension = 4;
+
   /*
    * Backward by filter layer setup
    */
@@ -278,10 +213,10 @@ void MKLConvolutionLayer<Dtype>::Init(
                                    dimension, bdata_sizes, bdata_strides);
   bwdf_top_diff   ->create_layouts(convolutionBwdFilter, dnnResourceDiffDst,
                                    dimension, tdata_sizes, tdata_strides);
-  bwdf_filter_diff->create_layouts(convolutionFwd, dnnResourceFilter,
+  bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter,
                                    f_dimension, fdata_sizes, fdata_strides);
   // support for (iter_size > 1) requires additional buffer
-  bwdf_filter_diff_iter->create_layouts(convolutionFwd, dnnResourceFilter,
+  bwdf_filter_diff_iter->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter,
                                         f_dimension, fdata_sizes, fdata_strides);
 
   // Note: this caused some trouble for older MKL
@@ -293,13 +228,19 @@ void MKLConvolutionLayer<Dtype>::Init(
         dnnResourceDiffFilter);
     bwdf2fwd_filter_diff->remove_user_layout();
     status = dnnLayoutCreateFromPrimitive<Dtype>(
-        &bwdf2fwd_filter_diff->layout_usr, convolutionFwd, dnnResourceFilter);
+        &bwdf2fwd_filter_diff->layout_usr, convolutionBwdFilter, dnnResourceDiffFilter);
     CHECK_EQ(status, 0)
             << "Failed dnnLayoutCreateFromPrimitive with status "
             << status << "\n";
 
     bwdf2fwd_filter_diff->create_conversions();
   }
+}
+
+template <typename Dtype>
+void MKLConvolutionLayer<Dtype>::CreateBwdBiasPrimitive() {
+  int status;
+  size_t g = std::max(this->group_, 1);
+  size_t dimension = 4;
   /*
    * Backward by bias layer setup
    */
@@ -326,6 +267,122 @@ void MKLConvolutionLayer<Dtype>::Init(
   }
 }
 
+template <typename Dtype>
+MKLConvolutionLayer<Dtype>::~MKLConvolutionLayer() {
+  dnnDelete<Dtype>(convolutionFwd);
+  dnnDelete<Dtype>(convolutionBwdData);
+  dnnDelete<Dtype>(convolutionBwdFilter);
+  if (this->bias_term_)
+    dnnDelete<Dtype>(convolutionBwdBias);
+}
+
+template <typename Dtype>
+void MKLConvolutionLayer<Dtype>::Init(
+      const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  this->width_ = bottom[0]->width();
+  this->height_ = bottom[0]->height();
+  this->num_ = bottom[0]->num();
+
+  // TODO: clean up this
+  kernel_w_ = this->kernel_shape_.cpu_data()[1];
+  kernel_h_ = this->kernel_shape_.cpu_data()[0];
+  stride_w_ = this->stride_.cpu_data()[1];
+  stride_h_ = this->stride_.cpu_data()[0];
+  pad_w_ = this->pad_.cpu_data()[1];
+  pad_h_ = this->pad_.cpu_data()[0];
+
+  this->bottom_shape_ = &bottom[0]->shape();
+  compute_output_shape();
+  size_t n, g;
+  size_t iw, ih, ic;
+  size_t ow, oh, oc;
+  size_t kw, kh; /* filter */
+  size_t dimension = 4;
+
+  g = std::max(this->group_, 1);
+  n = this->num_;
+  iw = this->width_;
+  ih = this->height_;
+  ic = this->channels_;
+
+  ow = this->width_out_;
+  oh = this->height_out_;
+  oc = this->num_output_;
+
+  kw = this->kernel_w_;
+  kh = this->kernel_h_;
+
+  this->bdata_sizes[0] = iw;
+  this->bdata_sizes[1] = ih;
+  this->bdata_sizes[2] = ic;
+  this->bdata_sizes[3] = n;
+
+  this->bdata_strides[0] = 1;
+  this->bdata_strides[1] = iw;
+  this->bdata_strides[2] = iw*ih;
+  this->bdata_strides[3] = iw*ih*ic;
+
+  /* starting with MKL 2017 Gold in case of groups filter layout
+   * becomes 5D, i.e. groups become a separate dimension */
+  size_t g_mkl2017 = g;
+  f_dimension = dimension + (g != 1);
+  if (getMKLBuildDate() < 20160701) {
+    g_mkl2017 = 1;
+    f_dimension = dimension;
+  }
+
+  this->fdata_sizes[0] = kw;
+  this->fdata_sizes[1] = kh;
+  this->fdata_sizes[2] = ic/g;
+  this->fdata_sizes[3] = oc/g_mkl2017;
+  this->fdata_sizes[4] = g_mkl2017;
+
+  this->fdata_strides[0] = 1;
+  this->fdata_strides[1] = kw;
+  this->fdata_strides[2] = kw*kh;
+  this->fdata_strides[3] = kw*kh*ic/g;
+  this->fdata_strides[4] = kw*kh*ic/g*oc/g;
+
+  this->bias_sizes[0] = oc;
+
+  this->bias_strides[0] = 1;
+
+  this->tdata_sizes[0] = ow;
+  this->tdata_sizes[1] = oh;
+  this->tdata_sizes[2] = oc;
+  this->tdata_sizes[3] = n;
+
+  this->tdata_strides[0] = 1;
+  this->tdata_strides[1] = ow;
+  this->tdata_strides[2] = ow*oh;
+  this->tdata_strides[3] = ow*oh*oc;
+
+  this->convolutionStrides[0] = this->stride_w_;
+  this->convolutionStrides[1] = this->stride_h_;
+
+  this->inputOffset[0] = -this->pad_w_;
+  this->inputOffset[1] = -this->pad_h_;
+
+  // Names are for debugging purposes only.
+  fwd_bottom_data ->name = "fwd_bottom_data @ " + this->layer_param_.name();
+  fwd_top_data    ->name = "fwd_top_data @ " + this->layer_param_.name();
+  fwd_filter_data ->name = "fwd_filter_data @ " + this->layer_param_.name();
+  fwd_bias_data   ->name = "fwd_bias_data @ " + this->layer_param_.name();
+  bwdd_top_diff   ->name = "bwdd_top_diff @ " + this->layer_param_.name();
+  bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->layer_param_.name();
+  bwdd_filter_data->name = "bwdd_filter_data @ " + this->layer_param_.name();
+  bwdf_top_diff   ->name = "bwdf_top_diff @ " + this->layer_param_.name();
+  bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->layer_param_.name();
+  bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->layer_param_.name();
+  bwdf2fwd_filter_diff->name =
+    "bwdf2fwd_filter_diff @ " + this->layer_param_.name();
+  bwdb_top_diff   ->name = "bwdb_top_diff @ " + this->layer_param_.name();
+  bwdb_bias_diff  ->name = "bwdb_bias_diff @ " + this->layer_param_.name();
+
+  CreateFwdPrimitive();
+}
+
 template <typename Dtype>
 void MKLConvolutionLayer<Dtype>::LayerSetUp(
       const vector<Blob<Dtype>*>& bottom,
@@ -338,14 +395,21 @@ void MKLConvolutionLayer<Dtype>::LayerSetUp(
 template <typename Dtype>
 void MKLConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  bool reinitialize = (this->width_ == bottom[0]->width() &&
-                       this->height_ == bottom[0]->height() &&
-                       this->channels_ == bottom[0]->channels() &&
-                       this->num_ == bottom[0]->num()) ? false : true;
+  this->reshape = (this->width_ == bottom[0]->width() &&
+                   this->height_ == bottom[0]->height() &&
+                   this->channels_ == bottom[0]->channels() &&
+                   this->num_ == bottom[0]->num()) ? false : true;
   BaseConvolutionLayer<Dtype>::ReshapeForMKL(bottom, top);
-  if (reinitialize == true) {
+  if (this->reshape == true) {
+    // when reshape happens, sync weight and bias data/diff to cpu.
+    this->blobs_[0]->mutable_cpu_data();
+    this->blobs_[0]->mutable_cpu_diff();
+    if (this->bias_term_) {
+      this->blobs_[1]->mutable_cpu_data();
+      this->blobs_[1]->mutable_cpu_diff();
+    }
     Init(bottom, top);
   }
 }
@@ -436,6 +500,9 @@ void MKLConvolutionLayer<Dtype>::Backward_cpu(
   if (propagate_down[0]) {
     void *res_convolutionBwdData[dnnResourceNumber];
 
+    if ((convolutionBwdData == NULL) || this->reshape)
+      CreateBwdDataPrimitive();
+
     res_convolutionBwdData[dnnResourceDiffDst] =
             bwdd_top_diff->get_converted_prv(top[0], true);
     // Currently this conversion adds padding to weights.
@@ -463,6 +530,9 @@ void MKLConvolutionLayer<Dtype>::Backward_cpu(
   if (this->param_propagate_down(0)) {
     void *res_convolutionBwdFilter[dnnResourceNumber];
 
+    if ((convolutionBwdFilter == NULL) || this->reshape)
+      CreateBwdFilterPrimitive();
+
     res_convolutionBwdFilter[dnnResourceDiffDst] =
             bwdf_top_diff->get_converted_prv(top[0], true);
     // The last get_converted_prv() argument is a hack for reusing conversion
@@ -556,6 +626,9 @@ void MKLConvolutionLayer<Dtype>::Backward_cpu(
   if (this->param_propagate_down(1)) {
     void *res_convolutionBwdBias[dnnResourceNumber];
 
+    if ((convolutionBwdBias == NULL) || this->reshape)
+      CreateBwdBiasPrimitive();
+
     res_convolutionBwdBias[dnnResourceDiffDst] =
             bwdb_top_diff->get_converted_prv(top[0], true);
 
     if (Caffe::iter_size() > 1) {
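Note: the refactoring above splits the old monolithic `Init` into `CreateFwdPrimitive`/`CreateBwd*Primitive` and guards each backward path with a "NULL or reshape" check, so a backward primitive is only (re)built when it is actually needed or when the input shape changed. A minimal standalone sketch of that lazy-creation pattern, assuming hypothetical names (not the patch's actual MKL calls):

```cpp
#include <cstdio>

// Sketch: a primitive handle starts NULL and is (re)built on first use
// or after a reshape, mirroring the checks added in Backward_cpu above.
struct LazyPrimitive {
  void* handle = nullptr;   // stands in for a dnnPrimitive_t
  bool reshape = false;     // set by Reshape() when input dims change

  void create() {
    handle = this;          // stands in for primitive + layout creation
    reshape = false;
    std::printf("primitive (re)created\n");
  }
  void execute() {
    if (handle == nullptr || reshape) create();  // the guard from the patch
    std::printf("primitive executed\n");
  }
};

int main() {
  LazyPrimitive bwd_data;
  bwd_data.execute();       // first call builds the primitive
  bwd_data.execute();       // subsequent calls reuse it
  bwd_data.reshape = true;  // a new input shape invalidates it
  bwd_data.execute();       // rebuilt before running
}
```

The payoff is that inference-only runs never pay for backward-primitive construction, and shape changes no longer require rebuilding all primitives eagerly.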
diff --git a/src/caffe/layers/mkl_pooling_layer.cpp b/src/caffe/layers/mkl_pooling_layer.cpp
index 8fd2a191d..b3a1b9316 100644
--- a/src/caffe/layers/mkl_pooling_layer.cpp
+++ b/src/caffe/layers/mkl_pooling_layer.cpp
@@ -67,22 +67,22 @@ void MKLPoolingLayer<Dtype>::Init(
   num_ = bottom[0]->num();
 
   if (pool_param.global_pooling()) {
-    CHECK(!(pool_param.has_kernel_size() ||
+    CHECK(!(pool_param.kernel_size_size() ||
       pool_param.has_kernel_h() || pool_param.has_kernel_w()))
       << "With Global_pooling: true Filter size cannot specified";
   } else {
-    CHECK(!pool_param.has_kernel_size() !=
+    CHECK(!pool_param.kernel_size_size() !=
       !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
       << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-    CHECK(pool_param.has_kernel_size() ||
+    CHECK(pool_param.kernel_size_size() ||
       (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
       << "For non-square filters both kernel_h and kernel_w are required.";
   }
-  CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
+  CHECK((!pool_param.pad_size() && pool_param.has_pad_h()
       && pool_param.has_pad_w())
       || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
       << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
+  CHECK((!pool_param.stride_size() && pool_param.has_stride_h()
       && pool_param.has_stride_w())
       || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
       << "Stride is stride OR stride_h and stride_w are required.";
@@ -91,8 +91,15 @@ void MKLPoolingLayer<Dtype>::Init(
     kernel_h_ = bottom[0]->height();
     kernel_w_ = bottom[0]->width();
   } else {
-    if (pool_param.has_kernel_size()) {
-      kernel_h_ = kernel_w_ = pool_param.kernel_size();
+    if (pool_param.kernel_size_size()) {
+      CHECK(pool_param.kernel_size_size() == 1 || pool_param.kernel_size_size() == 2)
+        << "kernel_size must be specified once, or 2 values for Height and Width";
+      if (pool_param.kernel_size_size() == 1) {
+        kernel_h_ = kernel_w_ = pool_param.kernel_size(0);
+      } else {
+        kernel_h_ = pool_param.kernel_size(0);
+        kernel_w_ = pool_param.kernel_size(1);
+      }
     } else {
       kernel_h_ = pool_param.kernel_h();
       kernel_w_ = pool_param.kernel_w();
@@ -101,13 +108,33 @@ void MKLPoolingLayer<Dtype>::Init(
   CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
   CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
   if (!pool_param.has_pad_h()) {
-    pad_h_ = pad_w_ = pool_param.pad();
+    // CHECK(pool_param.pad_size() == 0 || pool_param.pad_size() == 1 || pool_param.pad_size() == 2)
+    CHECK(pool_param.pad_size() < 3)
+      << "pad must be specified no more than 3 dimensions";
+    if (pool_param.pad_size() == 0) {
+      pad_h_ = pad_w_ = 0;
+    } else if (pool_param.pad_size() == 1) {
+      pad_h_ = pad_w_ = pool_param.pad(0);
+    } else {
+      pad_h_ = pool_param.pad(0);
+      pad_w_ = pool_param.pad(1);
+    }
   } else {
     pad_h_ = pool_param.pad_h();
     pad_w_ = pool_param.pad_w();
   }
   if (!pool_param.has_stride_h()) {
-    stride_h_ = stride_w_ = pool_param.stride();
+    // CHECK(pool_param.stride_size() == 0 || pool_param.stride_size() == 1 || pool_param.stride_size() == 2)
+    CHECK(pool_param.stride_size() < 3)
+      << "stride must be specified no more than 3 dimensions";
+    if (pool_param.stride_size() == 0) {
+      stride_h_ = stride_w_ = 1;
+    } else if (pool_param.stride_size() == 1) {
+      stride_h_ = stride_w_ = pool_param.stride(0);
+    } else {
+      stride_h_ = pool_param.stride(0);
+      stride_w_ = pool_param.stride(1);
+    }
   } else {
     stride_h_ = pool_param.stride_h();
     stride_w_ = pool_param.stride_w();
@@ -131,7 +158,7 @@ void MKLPoolingLayer<Dtype>::Init(
   pooled_width_ = static_cast<int>(ceil(static_cast<float>(
       bottom[0]->width() + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
   bool force_exclude_padding_flag_ = false;
-  if (pad_h_ || pad_w_) {
+  if (pad_h_ || pad_w_ || kernel_h_ == 1 || kernel_w_ == 1) {
     // If we have padding, ensure that the last pooling starts strictly
     // inside the image (instead of at the padding); otherwise clip the last.
     if ((pooled_height_ - 1) * stride_h_ >= bottom[0]->height() + pad_h_) {
@@ -194,9 +221,7 @@ void MKLPoolingLayer<Dtype>::Init(
   }
 
-  size_t dim = 4;
-  size_t src_sizes[4], src_strides[4];
-  size_t dst_sizes[4], dst_strides[4];
+  dim = 4;
 
   src_sizes[0] = bottom[0]->width();
   src_sizes[1] = bottom[0]->height();
@@ -233,7 +258,6 @@ void MKLPoolingLayer<Dtype>::Init(
   bwd_top_diff->name = "bwd_top_diff @ " + this->layer_param_.name();
   bwd_bottom_diff->name = "bwd_bottom_diff @ " + this->layer_param_.name();
 
-  fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides, false);
   fwd_top_data   ->create_user_layout(dim, dst_sizes, dst_strides, false);
   bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides, false);
   bwd_top_diff   ->create_user_layout(dim, dst_sizes, dst_strides, false);
@@ -258,9 +282,10 @@ void MKLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       height_ == bottom[0]->height() &&
       width_ == bottom[0]->width() &&
       num_ == bottom[0]->num()) {
+    reshape = false;
     return;
   }
-
+  reshape = true;
   Init(bottom, top);
 }
 
@@ -285,8 +310,9 @@ void MKLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   if (NULL == bottom_data) {
     bottom_data =
       reinterpret_cast<void *>(const_cast<Dtype*>(bottom[0]->cpu_data()));
-    if (NULL == poolingFwd) {
+    if (NULL == poolingFwd || reshape) {
       // Now create poolingFwd
+      fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides, false);
       status = dnnPoolingCreateForward<Dtype>(&poolingFwd, NULL,
               this->algorithm, fwd_bottom_data->layout_usr,
               kernel_size, kernel_stride, src_offset, dnnBorderZeros);
@@ -298,7 +324,7 @@ void MKLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
               kernel_size, kernel_stride, src_offset, dnnBorderZeros);
       CHECK_EQ(status, E_SUCCESS);
     }
-  } else if (NULL == poolingFwd) {
+  } else if (NULL == poolingFwd || reshape) {
     // Is it the first pass? Create a primitive.
     CHECK_EQ((bottom[0]->get_prv_data_descriptor())->get_descr_type(),
             PrvMemDescr::PRV_DESCR_MKL2017);
@@ -309,7 +335,6 @@ void MKLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     DLOG(INFO) << "Using layout of " << mem_descr->name
             << " as input layout for " << this->layer_param_.name();
 
-    // copy shared_ptr
     fwd_bottom_data = mem_descr;
diff --git a/src/caffe/layers/mkl_relu_layer.cpp b/src/caffe/layers/mkl_relu_layer.cpp
index 996f1012a..5bc33e43e 100644
--- a/src/caffe/layers/mkl_relu_layer.cpp
+++ b/src/caffe/layers/mkl_relu_layer.cpp
@@ -54,7 +54,7 @@ template <typename Dtype>
 void MKLReLULayer<Dtype>::Init(
       const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  size_t dim = bottom[0]->shape().size();
+  this->dim = bottom[0]->shape().size();
   this->sizes_.resize(dim);
   this->strides_.resize(dim);
   for (size_t d = 0; d < dim; ++d) {
@@ -72,8 +72,6 @@ void MKLReLULayer<Dtype>::Init(
   this->bwd_top_diff_->name = "bwd_top_diff @ " + this->layer_param_.name();
 
-  this->fwd_bottom_data_->create_user_layout(dim, &(this->sizes_[0]),
-                                             &(this->strides_[0]), false);
   this->fwd_top_data_   ->create_user_layout(dim, &(this->sizes_[0]),
                                              &(this->strides_[0]), false);
   this->bwd_bottom_diff_->create_user_layout(dim, &(this->sizes_[0]),
@@ -100,10 +98,11 @@ void MKLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   NeuronLayer<Dtype>::Reshape(bottom, top);
 
   // Here I check for sizes whther to destroy primitives
-  size_t dim = bottom[0]->shape().size();
+  dim = bottom[0]->shape().size();
 
   // If dimensions of blobs are the same as they were then
   // do not really destroy primitives
+  reshape = false;
   if (dim == this->sizes_.size()) {
     // .. check for strides and size dims if they corresspond each other
@@ -123,7 +122,7 @@ void MKLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       return;
     }
   }
-
+  reshape = true;
   Init(bottom, top);
 }
 
@@ -135,7 +134,7 @@ void MKLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     reinterpret_cast<void*>(const_cast<Dtype*>(bottom[0]->prv_data()));
 
   if (bottom_data) {
-    if (reluFwd_ == NULL) {
+    if (reluFwd_ == NULL || reshape) {
       // first pass
       CHECK_EQ((bottom[0]->get_prv_data_descriptor())->get_descr_type(),
               PrvMemDescr::PRV_DESCR_MKL2017);
@@ -155,9 +154,9 @@ void MKLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       DLOG(INFO) << "Using layout of " << mem_descr->name
               << " as input layout for " << this->layer_param_.name();
 
+      // copy shared_ptr
       fwd_bottom_data_ = mem_descr;
-
       fwd_top_data_   ->create_internal_layout(reluFwd_, dnnResourceDst);
       bwd_top_diff_   ->create_internal_layout(reluFwd_, dnnResourceDst);
       bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc);
@@ -166,10 +165,12 @@ void MKLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     DLOG(INFO) << "Using cpu_data in MKLReLULayer.";
     bottom_data =
       reinterpret_cast<void*>(const_cast<Dtype*>(bottom[0]->cpu_data()));
-    if (reluFwd_ == NULL) {
+    if (reluFwd_ == NULL || reshape) {
       // first pass
       dnnError_t e;
       Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
+      this->fwd_bottom_data_->create_user_layout(dim, &(this->sizes_[0]),
+                                                 &(this->strides_[0]), false);
       e = dnnReLUCreateForward<Dtype>(&reluFwd_, NULL,
               fwd_bottom_data_->layout_usr, negative_slope);
       CHECK_EQ(e, E_SUCCESS);
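Note: the pooling and ReLU changes above share one idea: `Reshape` no longer rebuilds anything itself; it only records whether the blob dimensions actually changed, and the forward pass recreates primitives when that flag (or a NULL handle) says so. A minimal standalone sketch of the shape-change check, assuming hypothetical names:

```cpp
#include <cstdio>
#include <vector>

// Sketch: cache the last-seen dimensions and report whether primitives
// must be rebuilt, mirroring the `reshape` flag set in Reshape() above.
static bool needs_rebuild(std::vector<size_t>& cached,
                          const std::vector<size_t>& current) {
  if (cached == current) return false;  // same dims: keep primitives
  cached = current;                     // remember the new shape
  return true;                          // caller sets reshape = true
}

int main() {
  std::vector<size_t> cached;
  std::printf("%d\n", needs_rebuild(cached, {32, 64, 28, 28}));  // 1: first call
  std::printf("%d\n", needs_rebuild(cached, {32, 64, 28, 28}));  // 0: unchanged
  std::printf("%d\n", needs_rebuild(cached, {16, 64, 28, 28}));  // 1: batch changed
}
```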
diff --git a/src/caffe/layers/mkldnn_batch_norm_layer.cpp b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
index f1edfebd4..f9f504e73 100644
--- a/src/caffe/layers/mkldnn_batch_norm_layer.cpp
+++ b/src/caffe/layers/mkldnn_batch_norm_layer.cpp
@@ -192,7 +192,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bottom
   int32_t ic = this->channels_;
 
   bool bottom_data_is_prv = (const_cast<Dtype*>(bottom[0]->prv_data()) != NULL);
-
+  bool inplace = (bottom[0] == top[0]);
   engine cpu_engine = CpuEngine::Instance().get_engine();
   memory::data_type mpcsn = memory::data_type::f32;
@@ -249,13 +258,22 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNorm(const vector<Blob<Dtype>*>& bottom
   fwd_top_data.reset(new MKLDNNData<Dtype>(usr_mpd, prv_mpd, top[0], this));
   output_memory = fwd_top_data->create_output_memory();
 
+  if(inplace) {
+    if (bottom_data_is_prv) {
+      inplace_buffer_memory.reset(new memory(*prv_mpd));
+    } else {
+      inplace_buffer_memory.reset(new memory(*usr_mpd));
+    }
+  }
+
   mean_memory.resize(num_stats_batches_);
   variance_memory.resize(num_stats_batches_);
   input_stats.resize(num_stats_batches_);
+  input_inplace_buffer.resize(num_stats_batches_);
   output_stats.resize(num_stats_batches_);
   BatchNormFwd.resize(num_stats_batches_);
   for (int i = 0; i < num_stats_batches_; i++) {
-    InitBatchNormFwdPrimitive(i);
+    InitBatchNormFwdPrimitive(i, inplace);
   }
 
   //fwd_bottom_data->set_mkldnn_primitive(BatchNormFwd);     //Wrong passed primitive! (TODO: Checking!)
@@ -303,10 +312,29 @@ shared_ptr<memory> MKLDNNBatchNormLayer<Dtype>::GetStatsBatchMemory(
 }
 
 template <typename Dtype>
-void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx) {
+template <bool diff>
+shared_ptr<memory> MKLDNNBatchNormLayer<Dtype>::GetStatsBatchMemoryInplace(
+  shared_ptr<MKLDNNMemoryDescriptor<Dtype, diff> > mkldnn_mem, int idx,
+  shared_ptr<memory> buffer_memory) {
+  long data_offset =
+    idx * stats_batch_size_ * this->channels_ * this->width_ * this->height_;
+  engine cpu_engine = CpuEngine::Instance().get_engine();
+  shared_ptr<memory::desc> stats_md = mkldnn_mem->get_memory_desc();
+  CHECK(stats_md->data.ndims > 0 &&
+        stats_md->data.dims[0] == this->num_);
+  stats_md->data.dims[0] = stats_batch_size_;
+  shared_ptr<memory::primitive_desc> stats_mpd(
+    new memory::primitive_desc(*stats_md, cpu_engine));
+  shared_ptr<memory> stats(
+    new memory(*stats_mpd, static_cast<Dtype*>(buffer_memory->get_data_handle()) + data_offset));
+  return stats;
+}
+
+template <typename Dtype>
+void MKLDNNBatchNormLayer<Dtype>::InitBatchNormFwdPrimitive(int idx, bool inplace) {
   input_stats[idx] = GetStatsBatchMemory(fwd_bottom_data, idx);
   output_stats[idx] = GetStatsBatchMemory(fwd_top_data, idx);
-
+  if (inplace)
+    input_inplace_buffer[idx] = GetStatsBatchMemoryInplace(fwd_bottom_data, idx, inplace_buffer_memory);
   // ---- Create BatchNorm --------------------
   if (this->phase_ == TEST && !use_global_stats_) {
     if (use_weight_bias_) {
@@ -357,6 +385,7 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
 #ifdef DEBUG
   LOG(INFO) << "MKLDNNBatchNormLayer::Forward_cpu: " << this->layer_param_.name();
 #endif
+  bool inplace = (bottom[0] == top[0]);
 
   if(BatchNormFwd_pd == NULL)
     InitBatchNorm(bottom, top);
@@ -365,6 +394,12 @@ void MKLDNNBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom
   // update top that head at prv
   fwd_top_data->sync_before_write();
 
+  if(inplace && this->phase_ == TRAIN) {
+    caffe_copy(fwd_bottom_data->get_memory_count(),
+               static_cast<Dtype*>(fwd_bottom_data->get_memory_ptr(0)),
+               static_cast<Dtype*>(inplace_buffer_memory->get_data_handle()));
+  }
+
   for (int stats_batch_idx = 0; stats_batch_idx < num_stats_batches_; stats_batch_idx++) {
     if (use_global_stats_) {
       // use the stored mean/variance estimates.
@@ -489,7 +524,7 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
   bottom_diff_stats.resize(num_stats_batches_);
   BatchNormBwd.resize(num_stats_batches_);
   for (int i = 0; i < num_stats_batches_; i++) {
-    InitBatchNormBwdPrimitive(i);
+    InitBatchNormBwdPrimitive(i, inplace);
   }
 
   //bwd_top_diff->set_mkldnn_primitive(BatchNormBwd);      //Wrong passed primitive! (TODO: Checking!)
@@ -502,19 +537,31 @@ void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwd(
 }
 
 template <typename Dtype>
-void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwdPrimitive(int idx) {
+void MKLDNNBatchNormLayer<Dtype>::InitBatchNormBwdPrimitive(int idx, bool inplace) {
   top_diff_stats[idx] = GetStatsBatchMemory(bwd_top_diff, idx);
   bottom_diff_stats[idx] = GetStatsBatchMemory(bwd_bottom_diff, idx);
-
-  if (use_weight_bias_) {
-    BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
-          *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
-          *top_diff_stats[idx], *scaleshift_memory,
-          *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory));
+  if (inplace) {
+    if (use_weight_bias_) {
+      BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+            *input_inplace_buffer[idx], *mean_memory[idx], *variance_memory[idx],
+            *top_diff_stats[idx], *scaleshift_memory,
+            *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory));
+    } else {
+      BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+            *input_inplace_buffer[idx], *mean_memory[idx], *variance_memory[idx],
+            *top_diff_stats[idx], *bottom_diff_stats[idx]));
+    }
   } else {
-    BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
-          *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
-          *top_diff_stats[idx], *bottom_diff_stats[idx]));
+    if (use_weight_bias_) {
+      BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+            *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
+            *top_diff_stats[idx], *scaleshift_memory,
+            *bottom_diff_stats[idx], *bwd_scaleshift_diff_memory));
+    } else {
+      BatchNormBwd[idx].reset(new batch_normalization_backward(*BatchNormBwd_pd,
+            *input_stats[idx], *mean_memory[idx], *variance_memory[idx],
+            *top_diff_stats[idx], *bottom_diff_stats[idx]));
+    }
   }
 }
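Note: the batch-norm change above exists because in-place layers (bottom == top) overwrite their input during the forward pass, yet the MKL-DNN backward primitive still needs the original activations; the patch stages them in `inplace_buffer_memory` before they are clobbered. A minimal standalone sketch of the idea, with hypothetical stand-ins for the forward transform:

```cpp
#include <cstdio>
#include <vector>

// Sketch: when bottom aliases top, copy the input to a side buffer first
// so the backward pass can still read the pre-normalization values.
int main() {
  std::vector<float> blob = {1.f, -2.f, 3.f};  // bottom aliases top
  std::vector<float> saved(blob);              // the inplace buffer copy

  for (float& v : blob) v = (v - 0.5f) * 2.f;  // stand-in for BN forward

  // Backward must normalize against the *original* input, not `blob`.
  for (size_t i = 0; i < saved.size(); ++i)
    std::printf("saved %.1f, overwritten %.1f\n", saved[i], blob[i]);
}
```

This is also why the backward primitives above are wired to `input_inplace_buffer[idx]` instead of `input_stats[idx]` on the in-place path.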
diff --git a/src/caffe/layers/mkldnn_convolution_layer.cpp b/src/caffe/layers/mkldnn_convolution_layer.cpp
index d65dbf3bf..25cec3361 100644
--- a/src/caffe/layers/mkldnn_convolution_layer.cpp
+++ b/src/caffe/layers/mkldnn_convolution_layer.cpp
@@ -64,7 +64,11 @@ MKLDNNConvolutionLayer<Dtype>::MKLDNNConvolutionLayer(const LayerParameter& param)
             , bwdd_top_diff_primitive(NULL), bwdd_weights_data_primitive(NULL)
             , bwdw_top_diff_primitive(NULL), bwdw_bottom_data_primitive(NULL)
             , width_(0), height_(0), width_out_(0), height_out_(0), kernel_w_(0), kernel_h_(0)
-            , stride_w_(0), stride_h_(0), pad_w_(0), pad_h_(0)
+            , stride_w_(0), stride_h_(0), pad_w_(0), pad_h_(0),
+            bwdw_weights_diff_iter(NULL),
+            bwdw_bias_diff_iter(NULL),
+            bwdw_weights_diff_memory_iter(NULL),
+            bwdw_bias_diff_memory_iter(NULL)
 {
   PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
   PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -117,6 +121,15 @@ void MKLDNNConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
   ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
   init_properties(bottom, top);
   this->bottom_shape_ = &bottom[0]->shape();
+
+  // support for (iter_size > 1) requires additional buffer for weights diff and bias diff
+  // Because Net is initialized before Caffe::set_iter_size, so additional buffer should be new and set here
+  bwdw_weights_diff_iter_blob.reset(new Blob<Dtype>());
+  bwdw_weights_diff_iter_blob->ReshapeLike(*(this->blobs_[0]));
+  if (this->bias_term_) {
+    bwdw_bias_diff_iter_blob.reset(new Blob<Dtype>());
+    bwdw_bias_diff_iter_blob->ReshapeLike(*(this->blobs_[1]));
+  }
 }
 
 template <typename Dtype>
@@ -441,23 +454,53 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>*>& bottom
     bwdw_weights_diff->name = "bwdw_weights_diff @ " + this->layer_param_.name();
     bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory();
 
+    if (Caffe::iter_size() > 1) {
+      // support for (iter_size > 1) weights diff requires additional buffer
+      shared_ptr<MemPD> prv_bwdw_weights_diff_memory_iter_pd(new MemPD(convBwdWeights_pd->diff_weights_primitive_desc()));
+      bwdw_weights_diff_iter.reset(new MKLDNNDiff<Dtype>(usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_iter_pd, bwdw_weights_diff_iter_blob.get(), this));
+      bwdw_weights_diff_memory_iter = bwdw_weights_diff_iter->create_output_memory();
+    }
+
     if (this->bias_term_) {
       shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(convBwdWeights_pd->diff_bias_primitive_desc()));
       bwdw_bias_diff.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd, this->blobs_[1].get(), this));
       bwdw_bias_diff->name = "bwdw_bias_diff @ " + this->layer_param_.name();
       bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory();
 
-      convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+      if (Caffe::iter_size() > 1) {
+        // support for (iter_size > 1) bias diff requires additional buffer
+        shared_ptr<MemPD> prv_bwdw_bias_diff_memory_iter_pd(new MemPD(convBwdWeights_pd->diff_bias_primitive_desc()));
+        bwdw_bias_diff_iter.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_iter_pd, bwdw_bias_diff_iter_blob.get(), this));
+        bwdw_bias_diff_memory_iter = bwdw_bias_diff_iter->create_output_memory();
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+                      , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                      , *bwdw_weights_diff_memory_iter, *bwdw_bias_diff_memory_iter));
+      } else {
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
                       , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
                       , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
+      }
 
       //bwdw_bias_diff->set_mkldnn_primitive(convBwdWeights);    //Wrong passed primitive! (For sure!)
       MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_transfer(bwdw_bias_diff_memory);
       bwdw_bias_diff->set_mkldnn_primitive(bwdw_bias_diff_memory_transfer);
+
+      if (Caffe::iter_size() > 1) {
+        // support for (iter_size > 1) bias diff requires additional buffer
+        MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_iter_transfer(bwdw_bias_diff_memory_iter);
+        bwdw_bias_diff_iter->set_mkldnn_primitive(bwdw_bias_diff_memory_iter_transfer);
+      }
     } else {
-      convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+      if (Caffe::iter_size() > 1) {
+        // if (iter_size > 1) then weights diff should be accumulated across iterations
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
+                      , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                      , *bwdw_weights_diff_memory_iter));
+      } else {
+        convBwdWeights.reset(new convolution_backward_weights(*convBwdWeights_pd
                       , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
                       , *bwdw_weights_diff_memory));
+      }
     }
 
     convBwdData.reset(new convolution_backward_data(*convBwdData_pd
@@ -489,6 +532,12 @@ void MKLDNNConvolutionLayer<Dtype>::InitConvolutionBwd(const vector<Blob<Dtype>*>& bottom
     MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_transfer(bwdw_weights_diff_memory);
     bwdw_weights_diff->set_mkldnn_primitive(bwdw_weights_diff_memory_transfer);
 
+    if (Caffe::iter_size() > 1) {
+      // support for (iter_size > 1) weights diff requires additional buffer
+      MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_iter_transfer(bwdw_weights_diff_memory_iter);
+      bwdw_weights_diff_iter->set_mkldnn_primitive(bwdw_weights_diff_memory_iter_transfer);
+    }
+
     // Names are for debugging purposes only.
 }
@@ -571,6 +620,34 @@ void MKLDNNConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
     PERFORMANCE_MEASUREMENT_BEGIN();
     convBwdWeights.submit();
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_weights_);
+
+    if (Caffe::iter_size() > 1) {
+      // if (iter_size > 1) then weights diff should be accumulated across iterations
+      if (this->blobs_[0]->prv_diff() != NULL) {
+        caffe_axpy(this->blobs_[0]->prv_diff_count(), Dtype(1),
+          (Dtype*)(bwdw_weights_diff_memory_iter->get_data_handle()),
+          this->blobs_[0]->mutable_prv_diff());
+      } else {
+        caffe_axpy(this->blobs_[0]->count(), Dtype(1),
+          (Dtype*)(bwdw_weights_diff_memory_iter->get_data_handle()),
+          this->blobs_[0]->mutable_cpu_diff());
+      }
+    }
+
+    if (this->param_propagate_down(1)) {
+      if (Caffe::iter_size() > 1) {
+        // if (iter_size > 1) then bias diff should be accumulated across iterations
+        if (this->blobs_[1]->prv_diff() != NULL) {
+          caffe_axpy(this->blobs_[1]->prv_diff_count(), Dtype(1),
+            (Dtype*)(bwdw_bias_diff_memory_iter->get_data_handle()),
+            this->blobs_[1]->mutable_prv_diff());
+        } else {
+          caffe_axpy(this->blobs_[1]->count(), Dtype(1),
+            (Dtype*)(bwdw_bias_diff_memory_iter->get_data_handle()),
+            this->blobs_[1]->mutable_cpu_diff());
+        }
+      }
+    }
   }
 }
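Note: the accumulation logic added above matters because with `iter_size > 1` the MKL-DNN backward-weights primitive would otherwise overwrite the blob's diff each sub-iteration; instead it writes into the `_iter` buffer and the result is folded in with an axpy (`y += a*x`). A minimal standalone sketch of that accumulation, with fabricated gradient values for illustration:

```cpp
#include <cstdio>

// Sketch: the caffe_axpy-style update used above to accumulate each
// sub-iteration's gradient into the persistent diff buffer.
static void axpy(int n, float a, const float* x, float* y) {
  for (int i = 0; i < n; ++i) y[i] += a * x[i];
}

int main() {
  float blob_diff[3] = {0.f, 0.f, 0.f};  // persistent diff of blobs_[0]
  float iter_diff[3];                    // bwdw_weights_diff_memory_iter
  for (int it = 0; it < 4; ++it) {       // Caffe::iter_size() == 4
    for (int i = 0; i < 3; ++i) iter_diff[i] = 0.25f * (it + 1);  // fake grads
    axpy(3, 1.f, iter_diff, blob_diff);  // accumulate across sub-iterations
  }
  std::printf("%.2f %.2f %.2f\n", blob_diff[0], blob_diff[1], blob_diff[2]);
}
```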
diff --git a/src/caffe/layers/mkldnn_inner_product_layer.cpp b/src/caffe/layers/mkldnn_inner_product_layer.cpp
index 1c92669c1..b25153c51 100644
--- a/src/caffe/layers/mkldnn_inner_product_layer.cpp
+++ b/src/caffe/layers/mkldnn_inner_product_layer.cpp
@@ -84,7 +84,11 @@ MKLDNNInnerProductLayer<Dtype>::MKLDNNInnerProductLayer(
             bwdw_top_diff_primitive(NULL),
             bwdw_bottom_data_primitive(NULL),
             w_(0),
-            h_(0)
+            h_(0),
+            bwdw_weights_diff_iter(NULL),
+            bwdw_bias_diff_iter(NULL),
+            bwdw_weights_diff_memory_iter(NULL),
+            bwdw_bias_diff_memory_iter(NULL)
 {
   PERFORMANCE_EVENT_ID_RESET(perf_id_fw_);
   PERFORMANCE_EVENT_ID_RESET(perf_id_bw_);
@@ -102,6 +106,15 @@ void MKLDNNInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
 {
     VLOG(1) << "MKLDNNInnerProductLayer::LayerSetUp: " << this->layer_param_.name();
     InnerProductLayer<Dtype>::LayerSetUp(bottom, top);
+
+    // support for (iter_size > 1) requires additional buffer for weights diff and bias diff
+    // Because Net is initialized before Caffe::set_iter_size, so additional buffer should be new and set here
+    bwdw_weights_diff_iter_blob.reset(new Blob<Dtype>());
+    bwdw_weights_diff_iter_blob->ReshapeLike(*(this->blobs_[0]));
+    if (this->bias_term_) {
+      bwdw_bias_diff_iter_blob.reset(new Blob<Dtype>());
+      bwdw_bias_diff_iter_blob->ReshapeLike(*(this->blobs_[1]));
+    }
 }
 
 template <typename Dtype>
@@ -403,19 +416,43 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype>*>& bottom
     bwdw_weights_diff->name = "bwdw_weights_diff @ " + this->layer_param_.name();
     bwdw_weights_diff_memory = bwdw_weights_diff->create_output_memory();
 
+    if (Caffe::iter_size() > 1) {
+      // support for (iter_size > 1) weights diff requires additional buffer
+      shared_ptr<MemPD> prv_bwdw_weights_diff_memory_iter_pd(new MemPD(ipBwdWeights_pd->diff_weights_primitive_desc()));
+      bwdw_weights_diff_iter.reset(new MKLDNNDiff<Dtype>(usr_weights_data_memory_pd, prv_bwdw_weights_diff_memory_iter_pd, bwdw_weights_diff_iter_blob.get(), this));
+      bwdw_weights_diff_memory_iter = bwdw_weights_diff_iter->create_output_memory();
+    }
+
     if (this->bias_term_) {
       shared_ptr<MemPD> prv_bwdw_bias_diff_memory_pd(new MemPD(ipBwdWeights_pd->diff_bias_primitive_desc()));
       bwdw_bias_diff.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_pd, this->blobs_[1].get(), this));
       bwdw_bias_diff ->name = "bwdw_bias_diff @ " + this->layer_param_.name();
       bwdw_bias_diff_memory = bwdw_bias_diff->create_output_memory();
-      ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+      if (Caffe::iter_size() > 1) {
+        // support for (iter_size > 1) bias diff requires additional buffer
+        shared_ptr<MemPD> prv_bwdw_bias_diff_memory_iter_pd(new MemPD(ipBwdWeights_pd->diff_bias_primitive_desc()));
+        bwdw_bias_diff_iter.reset(new MKLDNNDiff<Dtype>(usr_bias_data_memory_pd, prv_bwdw_bias_diff_memory_iter_pd, bwdw_bias_diff_iter_blob.get(), this));
+        bwdw_bias_diff_memory_iter = bwdw_bias_diff_iter->create_output_memory();
+        ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
                     , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                    , *bwdw_weights_diff_memory_iter, *bwdw_bias_diff_memory_iter));
+      } else {
+        ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+                    , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
                     , *bwdw_weights_diff_memory, *bwdw_bias_diff_memory));
+      }
     } else {
-      ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+      if (Caffe::iter_size() > 1) {
+        // if (iter_size > 1) then weights diff should be accumulated across iterations
+        ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
+                    , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
+                    , *bwdw_weights_diff_memory_iter));
+      } else {
+        ipBwdWeights.reset(new inner_product_backward_weights(*ipBwdWeights_pd
                     , *bwdw_bottom_data_primitive, *bwdw_top_diff_primitive
                     , *bwdw_weights_diff_memory));
+      }
     }
 
     ipBwdData.reset(new inner_product_backward_data(*ipBwdData_pd
@@ -447,11 +484,23 @@ void MKLDNNInnerProductLayer<Dtype>::InitInnerProductBwd(const vector<Blob<Dtype>*>& bottom
     MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_transfer(bwdw_weights_diff_memory);
     bwdw_weights_diff->set_mkldnn_primitive(bwdw_weights_diff_memory_transfer);
 
+    if (Caffe::iter_size() > 1) {
+      // support for (iter_size > 1) weights diff requires additional buffer
+      MKLDNNPrimitive<Dtype> bwdw_weights_diff_memory_iter_transfer(bwdw_weights_diff_memory_iter);
+      bwdw_weights_diff_iter->set_mkldnn_primitive(bwdw_weights_diff_memory_iter_transfer);
+    }
+
     if (this->bias_term_) {
       //bwdw_bias_diff->set_mkldnn_primitive(ipBwdWeights);    //Wrong passed primitive! (TODO: Checking!)
       MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_transfer(bwdw_bias_diff_memory);
       bwdw_bias_diff->set_mkldnn_primitive(bwdw_bias_diff_memory_transfer);
+
+      if (Caffe::iter_size() > 1) {
+        // support for (iter_size > 1) bias diff requires additional buffer
+        MKLDNNPrimitive<Dtype> bwdw_bias_diff_memory_iter_transfer(bwdw_bias_diff_memory_iter);
+        bwdw_bias_diff_iter->set_mkldnn_primitive(bwdw_bias_diff_memory_iter_transfer);
+      }
     }
 }
 
@@ -540,6 +589,34 @@ void MKLDNNInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top
     PERFORMANCE_MEASUREMENT_BEGIN();
     ipBwdWeights.submit();
     PERFORMANCE_MEASUREMENT_END_ID(perf_id_bw_weights_);
+
+    if (Caffe::iter_size() > 1) {
+      // if (iter_size > 1) then weights diff should be accumulated across iterations
+      if (this->blobs_[0]->prv_diff() != NULL) {
+        caffe_axpy(this->blobs_[0]->prv_diff_count(), Dtype(1),
+          (Dtype*)(bwdw_weights_diff_memory_iter->get_data_handle()),
+          this->blobs_[0]->mutable_prv_diff());
+      } else {
+        caffe_axpy(this->blobs_[0]->count(), Dtype(1),
+          (Dtype*)(bwdw_weights_diff_memory_iter->get_data_handle()),
+          this->blobs_[0]->mutable_cpu_diff());
+      }
+    }
+
+    if (this->param_propagate_down(1)) {
+      if (Caffe::iter_size() > 1) {
+        // if (iter_size > 1) then bias diff should be accumulated across iterations
+        if (this->blobs_[1]->prv_diff() != NULL) {
+          caffe_axpy(this->blobs_[1]->prv_diff_count(), Dtype(1),
+            (Dtype*)(bwdw_bias_diff_memory_iter->get_data_handle()),
+            this->blobs_[1]->mutable_prv_diff());
+        } else {
+          caffe_axpy(this->blobs_[1]->count(), Dtype(1),
+            (Dtype*)(bwdw_bias_diff_memory_iter->get_data_handle()),
+            this->blobs_[1]->mutable_cpu_diff());
+        }
+      }
+    }
   }
 }
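Note: both MKL-DNN layers allocate their `_iter` staging blobs unconditionally in `LayerSetUp` because, as the added comment says, the Net is constructed before `Caffe::set_iter_size` runs, so the layer cannot know at setup time whether the buffer will be needed. A minimal standalone sketch of the `ReshapeLike` staging, with a simplified stand-in Blob type:

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// Sketch: the staging buffer always mirrors the geometry of the weight
// blob it shadows, so it stays valid even if iter_size is raised later.
struct Blob {
  std::vector<int> shape;
  std::vector<float> data;
  void ReshapeLike(const Blob& other) {
    shape = other.shape;
    data.assign(other.data.size(), 0.f);
  }
};

int main() {
  Blob weights;
  weights.shape = {64, 32, 3, 3};
  weights.data.resize(64 * 32 * 3 * 3);

  std::shared_ptr<Blob> iter_buf(new Blob());  // bwdw_weights_diff_iter_blob
  iter_buf->ReshapeLike(weights);              // same geometry as the weights
  std::printf("staging buffer holds %zu values\n", iter_buf->data.size());
}
```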
diff --git a/src/caffe/layers/mkldnn_pooling_layer.cpp b/src/caffe/layers/mkldnn_pooling_layer.cpp
index 40f5cf228..94c778238 100644
--- a/src/caffe/layers/mkldnn_pooling_layer.cpp
+++ b/src/caffe/layers/mkldnn_pooling_layer.cpp
@@ -59,18 +59,18 @@ void MKLDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
   PoolingParameter pool_param = this->layer_param_.pooling_param();
 
   if (pool_param.global_pooling()) {
-    CHECK(!(pool_param.has_kernel_size() || pool_param.has_kernel_h() || pool_param.has_kernel_w()))
+    CHECK(!(pool_param.kernel_size_size() || pool_param.has_kernel_h() || pool_param.has_kernel_w()))
       << "With Global_pooling: true Filter size cannot specified";
   } else {
-    CHECK(!pool_param.has_kernel_size() != !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+    CHECK(!pool_param.kernel_size_size() != !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
       << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-    CHECK(pool_param.has_kernel_size() ||(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+    CHECK(pool_param.kernel_size_size() ||(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
       << "For non-square filters both kernel_h and kernel_w are required.";
   }
-  CHECK((!pool_param.has_pad() && pool_param.has_pad_h() && pool_param.has_pad_w())
+  CHECK((!pool_param.pad_size() && pool_param.has_pad_h() && pool_param.has_pad_w())
       || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
     << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!pool_param.has_stride() && pool_param.has_stride_h() && pool_param.has_stride_w())
+  CHECK((!pool_param.stride_size() && pool_param.has_stride_h() && pool_param.has_stride_w())
       || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
     << "Stride is stride OR stride_h and stride_w are required.";
 
@@ -79,27 +79,56 @@ void MKLDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
     kernel_h_ = bottom[0]->height();
     kernel_w_ = bottom[0]->width();
   } else {
-    if (pool_param.has_kernel_size()) {
-      kernel_h_ = kernel_w_ = pool_param.kernel_size();
+    if (pool_param.kernel_size_size()) {
+      CHECK(pool_param.kernel_size_size() == 1 || pool_param.kernel_size_size() == 2)
+        << "kernel_size must be specified once, or 2 values for Height and Width";
+      if (pool_param.kernel_size_size() == 1) {
+        kernel_h_ = kernel_w_ = pool_param.kernel_size(0);
+      } else {
+        kernel_h_ = pool_param.kernel_size(0);
+        kernel_w_ = pool_param.kernel_size(1);
+      }
     } else {
-      kernel_h_ = pool_param.kernel_h();
-      kernel_w_ = pool_param.kernel_w();
+      kernel_h_ = pool_param.kernel_h();
+      kernel_w_ = pool_param.kernel_w();
     }
   }
+
   CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
   CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
   if (!pool_param.has_pad_h()) {
-    pad_t_ = pad_b_ = pad_l_ = pad_r_ = pool_param.pad();
+    CHECK(pool_param.pad_size() < 3)
+      << "pad must be specified no more than 3 dimensions";
+    if (pool_param.pad_size() == 0) {
+      pad_t_ = pad_b_ = pad_l_ = pad_r_ = 0;
+    } else if (pool_param.pad_size() == 1) {
+      pad_t_ = pad_b_ = pad_l_ = pad_r_ = pool_param.pad(0);
+    } else {
+      pad_t_ = pad_b_ = pool_param.pad(0);
+      pad_l_ = pad_r_ = pool_param.pad(1);
+    }
   } else {
     pad_t_ = pad_b_ = pool_param.pad_h();
     pad_l_ = pad_r_ = pool_param.pad_w();
   }
+
   if (!pool_param.has_stride_h()) {
-    stride_h_ = stride_w_ = pool_param.stride();
+    CHECK(pool_param.stride_size() < 3)
+      << "stride must be specified no more than 3 dimensions";
+    if (pool_param.stride_size() == 0) {
+      stride_h_ = stride_w_ = 1;
+    } else if (pool_param.stride_size() == 1) {
+      stride_h_ = stride_w_ = pool_param.stride(0);
+    } else {
+      stride_h_ = pool_param.stride(0);
+      stride_w_ = pool_param.stride(1);
+    }
   } else {
     stride_h_ = pool_param.stride_h();
     stride_w_ = pool_param.stride_w();
   }
+
   if (global_pooling_) {
     CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
       << "With Global_pooling: true; only pad = 0 and stride = 1";
@@ -117,7 +146,7 @@ void MKLDNNPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom
   width_out_ = static_cast<int>(ceil(static_cast<float>(
       bottom[0]->width() + pad_r_ + pad_l_ - kernel_w_) / stride_w_)) + 1;
 
-  if (pad_t_ || pad_b_ || pad_r_ || pad_l_) {
+  if (pad_t_ || pad_b_ || pad_r_ || pad_l_ || kernel_h_ == 1 || kernel_w_ == 1) {
     // If we have padding, ensure that the last pooling starts strictly
     // inside the image (instead of at the padding); otherwise clip the last.
     if ((height_out_ - 1) * stride_h_ >= bottom[0]->height() + pad_t_) {
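Note: the pooling changes above (here and in the MKL/plain CPU layers) all migrate from the old scalar `kernel_size`/`pad`/`stride` proto fields to repeated fields, with the precedence: values in the repeated field win, one value means square, two values mean height then width, and otherwise the explicit `_h`/`_w` pair is used. A minimal standalone sketch of that resolution rule, with a hypothetical helper name:

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Sketch: resolve the repeated kernel_size field the way the patch does.
// 0 values -> fall back to kernel_h/kernel_w, 1 -> square, 2 -> H then W.
static void resolve_kernel(const std::vector<int>& kernel_size,
                           int kernel_h, int kernel_w, int* kh, int* kw) {
  if (!kernel_size.empty()) {
    assert(kernel_size.size() <= 2 && "once, or 2 values for H and W");
    *kh = kernel_size[0];
    *kw = kernel_size.size() == 2 ? kernel_size[1] : kernel_size[0];
  } else {
    *kh = kernel_h;  // explicit h/w pair path
    *kw = kernel_w;
  }
}

int main() {
  int kh, kw;
  resolve_kernel({3}, 0, 0, &kh, &kw);     // square 3x3
  std::printf("%dx%d\n", kh, kw);
  resolve_kernel({3, 5}, 0, 0, &kh, &kw);  // 3 high, 5 wide
  std::printf("%dx%d\n", kh, kw);
  resolve_kernel({}, 2, 7, &kh, &kw);      // kernel_h/kernel_w path
  std::printf("%dx%d\n", kh, kw);
}
```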
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 1f4f342ad..f4b0a6294 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -39,120 +39,355 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <algorithm>
 #include <vector>
 
-#include "caffe/layers/pooling_layer.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/syncedmem.hpp"
 #include "caffe/util/math_functions.hpp"
+#include "caffe/layers/pooling_layer.hpp"
+
 namespace caffe {
 
+using std::min;
+using std::max;
+
 template <typename Dtype>
 void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   PoolingParameter pool_param = this->layer_param_.pooling_param();
-  if (pool_param.global_pooling()) {
-    CHECK(!(pool_param.has_kernel_size() ||
-        pool_param.has_kernel_h() || pool_param.has_kernel_w()))
-      << "With Global_pooling: true Filter size cannot specified";
-  } else {
-    CHECK(!pool_param.has_kernel_size() !=
-        !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-      << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
-    CHECK(pool_param.has_kernel_size() ||
-        (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
-      << "For non-square filters both kernel_h and kernel_w are required.";
-  }
-  CHECK((!pool_param.has_pad() && pool_param.has_pad_h()
-      && pool_param.has_pad_w())
-      || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
-    << "pad is pad OR pad_h and pad_w are required.";
-  CHECK((!pool_param.has_stride() && pool_param.has_stride_h()
-      && pool_param.has_stride_w())
-      || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
-    << "Stride is stride OR stride_h and stride_w are required.";
-  global_pooling_ = pool_param.global_pooling();
-  if (global_pooling_) {
-    kernel_h_ = bottom[0]->height();
-    kernel_w_ = bottom[0]->width();
-  } else {
-    if (pool_param.has_kernel_size()) {
-      kernel_h_ = kernel_w_ = pool_param.kernel_size();
+  // find channel axis and compute spatial axes constants
+  channel_axis_ = bottom[0]->CanonicalAxisIndex(pool_param.axis());
+  channels_ = bottom[0]->shape(channel_axis_);
+  const int first_spatial_axis = channel_axis_ + 1;
+  const int num_axes = bottom[0]->num_axes();
+  num_spatial_axes_ = num_axes - first_spatial_axis;
+  CHECK_GE(num_spatial_axes_, 1);
+
+  if (num_spatial_axes_ == 2) {
+    // Process 2D Pooling
+    if (pool_param.global_pooling()) {
+      CHECK(!(pool_param.kernel_size_size() ||
+          pool_param.has_kernel_h() || pool_param.has_kernel_w()))
+        << "With Global_pooling: true Filter size cannot specified";
+    } else {
+      CHECK(!pool_param.kernel_size_size() !=
+          !(pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+        << "Filter size is kernel_size OR kernel_h and kernel_w; not both";
+      CHECK(pool_param.kernel_size_size() ||
+          (pool_param.has_kernel_h() && pool_param.has_kernel_w()))
+        << "For non-square filters both kernel_h and kernel_w are required.";
+    }
+    CHECK((!pool_param.pad_size() && pool_param.has_pad_h()
+        && pool_param.has_pad_w())
+        || (!pool_param.has_pad_h() && !pool_param.has_pad_w()))
+      << "pad is pad OR pad_h and pad_w are required.";
+    CHECK((!pool_param.stride_size() && pool_param.has_stride_h()
+        && pool_param.has_stride_w())
+        || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
+      << "Stride is stride OR stride_h and stride_w are required.";
+    global_pooling_ = pool_param.global_pooling();
+    if (global_pooling_) {
+      kernel_h_ = bottom[0]->height();
+      kernel_w_ = bottom[0]->width();
+    } else {
+      if (pool_param.kernel_size_size()) {
+        CHECK(pool_param.kernel_size_size() == 1 || pool_param.kernel_size_size() == 2)
+          << "kernel_size must be specified once, or 2 values for Height and Width";
+        if (pool_param.kernel_size_size() == 1) {
+          kernel_h_ = kernel_w_ = pool_param.kernel_size(0);
+        } else {
+          kernel_h_ = pool_param.kernel_size(0);
+          kernel_w_ = pool_param.kernel_size(1);
+        }
+      } else {
+        kernel_h_ = pool_param.kernel_h();
+        kernel_w_ = pool_param.kernel_w();
+      }
+    }
+    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+    if (pool_param.pad_size() > 0) {
+      CHECK(pool_param.pad_size() == 1 || pool_param.pad_size() == 2)
+        << "pad must be specified once, or 2 values for Height and Width";
+      if (pool_param.pad_size() == 1) {
+        pad_h_ = pad_w_ = pool_param.pad(0);
+      } else {
+        pad_h_ = pool_param.pad(0);
+        pad_w_ = pool_param.pad(1);
+      }
+    } else {
+      pad_h_ = pool_param.pad_h();
+      pad_w_ = pool_param.pad_w();
+    }
+    if (pool_param.stride_size() > 0) {
+      CHECK(pool_param.stride_size() == 1 || pool_param.stride_size() == 2)
+        << "stride must be specified once, or 2 values for Height and Width";
+      if (pool_param.stride_size() == 1) {
+        stride_h_ = stride_w_ = pool_param.stride(0);
+      } else {
+        stride_h_ = pool_param.stride(0);
+        stride_w_ = pool_param.stride(1);
+      }
+    } else {
+      stride_h_ = pool_param.stride_h();
+      stride_w_ = pool_param.stride_w();
+    }
+    if (global_pooling_) {
+      CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+    }
+    if (pad_h_ != 0 || pad_w_ != 0) {
+      CHECK(this->layer_param_.pooling_param().pool()
+          == PoolingParameter_PoolMethod_AVE
+          || this->layer_param_.pooling_param().pool()
+          == PoolingParameter_PoolMethod_MAX)
+        << "Padding implemented only for average and max pooling.";
+      CHECK_LT(pad_h_, kernel_h_);
+      CHECK_LT(pad_w_, kernel_w_);
+    }
+  } else if (num_spatial_axes_ == 3) {
+    // Process 3D Pooling
+    // Setup input dimensions (input_shape_).
+    vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
+
+    // LOG(INFO) << "channel_axis_: " << channel_axis_ << " channels_: " << channels_ << " num_axes: " << num_axes;
+    input_shape_.Reshape(bottom_dim_blob_shape);
+
+    int* input_shape_data = input_shape_.mutable_cpu_data();
+    for (int i = 0; i < num_spatial_axes_ + 1; ++i) {
+      input_shape_data[i] = bottom[0]->shape(channel_axis_ + i);
+    }
+    vector<int> spatial_dim_blob_shape(1, num_spatial_axes_);
+
+    global_pooling_ = pool_param.global_pooling();
+    // Setup filter kernel dimensions (kernel_shape_).
+    kernel_shape_.Reshape(spatial_dim_blob_shape);
+    int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
+    if (global_pooling_) {
+      // if global pooling height and width are set the entire blob,
+      // and the layer cannot have a kernel set
+      CHECK_GE(0, pool_param.kernel_size_size())
+        << "With Global_pooling: true Filter size cannot specified.";
+      CHECK(!pool_param.has_kernel_h() || !pool_param.has_kernel_w())
+        << "With Global_pooling: true Filter size cannot specified.";
+      for (int i = 0; i < num_spatial_axes_ + 1; ++i) {
+        kernel_shape_data[i] = bottom[0]->shape(channel_axis_ + i);
+      }
+    } else {
+      // if kernel_h or kernel_w are set we cannot set the kernel another way
+      // And there must be 2 spatial dims
+      if (pool_param.has_kernel_h() || pool_param.has_kernel_w()) {
+        CHECK_EQ(num_spatial_axes_, 2)
+          << "kernel_h & kernel_w can only be used for 2D pooling.";
+        CHECK_EQ(0, pool_param.kernel_size_size())
+          << "Either kernel_size or kernel_h/w should be specified, not both.";
+        kernel_shape_data[0] = pool_param.kernel_h();
+        kernel_shape_data[1] = pool_param.kernel_w();
+      } else {
+        // using repeated kernel param
+        const int num_kernel_dims = pool_param.kernel_size_size();
+        CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
+          << "kernel_size must be specified once, or once per spatial dimension"
+          << " (kernel_size specified " << num_kernel_dims << " times "
+          << num_spatial_axes_ << " spatial dims).";
+        for (int i = 0; i < num_spatial_axes_; ++i) {
+          kernel_shape_data[i] = pool_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
+        }
+      }
+    }
+
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero.";
+    }
+
+    // setup padding dimensions (pad_)
+    pad_.Reshape(spatial_dim_blob_shape);
+    int* pad_data = pad_.mutable_cpu_data();
+    int pad_sum = 0;
+    if (pool_param.has_pad_h() || pool_param.has_pad_w()) {
+      // if pad_h or pad_w are set we cannot set the pad another way
+      // And there must be 2 spatial dims
+      CHECK_EQ(num_spatial_axes_, 2)
+        << "pad_h & pad_w can only be used for 2D convolution.";
+      CHECK_EQ(0, pool_param.pad_size())
+        << "Either pad or pad_h/w should be specified, not both.";
+      pad_data[0] = pool_param.pad_h();
+      pad_data[1] = pool_param.pad_w();
+    } else {
+      // using repeated pad param
+      const int num_pad_dims = pool_param.pad_size();
+      CHECK(num_pad_dims == 0 || num_pad_dims == 1 ||
+            num_pad_dims == num_spatial_axes_)
+        << "pad must be specified once, or once per spatial dimension "
+        << "(pad specified " << num_pad_dims << " times "
+        << num_spatial_axes_ << " spatial dims).";
+      const int kDefaultPad = 0;
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
+            pool_param.pad((num_pad_dims == 1) ? 0 : i);
+        if (global_pooling_) {
+          CHECK_EQ(pad_data[i], 0)
+            << "With Global_pooling: true; pool = 0";
+        }
+        CHECK_LT(pad_data[i], kernel_shape_data[i]);
+        pad_sum += pad_data[i];
+      }
+    }
+
+    if (pad_sum != 0) {
+      CHECK(this->layer_param_.pooling_param().pool() ==
+            PoolingParameter_PoolMethod_AVE
+          || this->layer_param_.pooling_param().pool() ==
+            PoolingParameter_PoolMethod_MAX)
+        << "Padding implemented only for average and max pooling.";
+    }
+
+    // Setup stride dimensions (stride_).
+    stride_.Reshape(spatial_dim_blob_shape);
+    int* stride_data = stride_.mutable_cpu_data();
+    if (pool_param.has_stride_h() || pool_param.has_stride_w()) {
+      CHECK_EQ(num_spatial_axes_, 2)
+        << "stride_h & stride_w can only be used for 2D convolution.";
+      CHECK_EQ(0, pool_param.stride_size())
+        << "Either stride or stride_h/w should be specified, not both.";
+      stride_data[0] = pool_param.stride_h();
+      stride_data[1] = pool_param.stride_w();
+    } else {
+      // using repeated stride param
+      const int num_stride_dims = pool_param.stride_size();
+      CHECK(num_stride_dims == 0 || num_stride_dims == 1 ||
+            num_stride_dims == num_spatial_axes_)
+        << "stride must be specified once, or once per spatial dimension "
+        << "(stride specified " << num_stride_dims << " times "
+        << num_spatial_axes_ << " spatial dims).";
+      const int kDefaultStride = 1;
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
+            pool_param.stride((num_stride_dims == 1) ? 0 : i);
+        CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero.";
+        if (global_pooling_) {
+          CHECK_EQ(stride_data[i], 1)
+            << "With Global_pooling: true; stride = 1";
+        }
+      }
+    }
   } else {
-      kernel_h_ = pool_param.kernel_h();
-      kernel_w_ = pool_param.kernel_w();
+    NOT_IMPLEMENTED;
   }
-  }
-  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
-  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
-  if (!pool_param.has_pad_h()) {
-    pad_h_ = pad_w_ = pool_param.pad();
-  } else {
-    pad_h_ = pool_param.pad_h();
-    pad_w_ = pool_param.pad_w();
-  }
-  if (!pool_param.has_stride_h()) {
-    stride_h_ = stride_w_ = pool_param.stride();
-  } else {
-    stride_h_ = pool_param.stride_h();
-    stride_w_ = pool_param.stride_w();
-  }
-  if (global_pooling_) {
-    CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1)
-      << "With Global_pooling: true; only pad = 0 and stride = 1";
-  }
-  if (pad_h_ != 0 || pad_w_ != 0) {
-    CHECK(this->layer_param_.pooling_param().pool()
-        == PoolingParameter_PoolMethod_AVE
-        || this->layer_param_.pooling_param().pool()
-        == PoolingParameter_PoolMethod_MAX)
-      << "Padding implemented only for average and max pooling.";
-    CHECK_LT(pad_h_, kernel_h_);
-    CHECK_LT(pad_w_, kernel_w_);
-  }
 }
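Note: the `Reshape` rewrite that follows keeps Caffe's classic output-size rule, and the patch extends the clipping condition with `kernel == 1`. A minimal standalone sketch of the computation and why the new clause matters (numbers are illustrative, not from the patch):

```cpp
#include <cmath>
#include <cstdio>

// Sketch of the rule used in Reshape below:
//   pooled = ceil((in + 2*pad - kernel) / stride) + 1,
// then clip the last window so it starts inside the image when padding
// is present or (after this patch) when the kernel size is 1.
static int pooled_dim(int in, int pad, int kernel, int stride) {
  int out = static_cast<int>(
      std::ceil(static_cast<float>(in + 2 * pad - kernel) / stride)) + 1;
  if (pad || kernel == 1) {
    if ((out - 1) * stride >= in + pad) --out;  // last window out of range
  }
  return out;
}

int main() {
  // 3x3/stride-2 pooling with pad 1 on a 5-wide input: ceil(4/2)+1 = 3.
  std::printf("%d\n", pooled_dim(5, 1, 3, 2));   // 3
  // 1x1/stride-2 pooling on a 10-wide input: ceil gives 6 windows, but
  // the 6th would start at pixel 10 (out of bounds); the new kernel==1
  // clause clips it back to 5.
  std::printf("%d\n", pooled_dim(10, 0, 1, 2));  // 5
}
```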
 
 template <typename Dtype>
 void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-  CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
-      << "corresponding to (num, channels, height, width)";
-  channels_ = bottom[0]->channels();
-  height_ = bottom[0]->height();
-  width_ = bottom[0]->width();
-  if (global_pooling_) {
-    kernel_h_ = bottom[0]->height();
-    kernel_w_ = bottom[0]->width();
-  }
-  pooled_height_ = static_cast<int>(ceil(static_cast<float>(
-      height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
-  pooled_width_ = static_cast<int>(ceil(static_cast<float>(
-      width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
-  if (pad_h_ || pad_w_) {
-    // If we have padding, ensure that the last pooling starts strictly
-    // inside the image (instead of at the padding); otherwise clip the last.
-    if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
-      --pooled_height_;
-    }
-    if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
-      --pooled_width_;
-    }
-    CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
-    CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
-  }
-  top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
-      pooled_width_);
-  if (top.size() > 1) {
-    top[1]->ReshapeLike(*top[0]);
-  }
-  // If max pooling, we will initialize the vector index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_MAX && top.size() == 1) {
-    max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-        pooled_width_);
-  }
-  // If stochastic pooling, we will initialize the random index part.
-  if (this->layer_param_.pooling_param().pool() ==
-      PoolingParameter_PoolMethod_STOCHASTIC) {
-    rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
-        pooled_width_);
+  PoolingParameter pool_param = this->layer_param_.pooling_param();
+  channel_axis_ = bottom[0]->CanonicalAxisIndex(pool_param.axis());
+  num_ = bottom[0]->count(0, channel_axis_);
+  const int first_spatial_axis = channel_axis_ + 1;
+  const int num_axes = bottom[0]->num_axes();
+  num_spatial_axes_ = num_axes - first_spatial_axis;
+  CHECK_GE(num_spatial_axes_, 1);
+
+  if (num_spatial_axes_ == 2) {
+    // Process 2D Pooling
+    CHECK_EQ(4, bottom[0]->num_axes()) << "Input must have 4 axes, "
+        << "corresponding to (num, channels, height, width).";
+    channels_ = bottom[0]->channels();
+    height_ = bottom[0]->height();
+    width_ = bottom[0]->width();
+    if (global_pooling_) {
+      kernel_h_ = bottom[0]->height();
+      kernel_w_ = bottom[0]->width();
+    }
+    pooled_height_ = static_cast<int>(ceil(static_cast<float>(
+        height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+    pooled_width_ = static_cast<int>(ceil(static_cast<float>(
+        width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+    if (pad_h_ || pad_w_ || kernel_h_ == 1 || kernel_w_ == 1) {
+      // If we have padding, ensure that the last pooling starts strictly
+      // inside the image (instead of at the padding); otherwise clip the last.
+      if ((pooled_height_ - 1) * stride_h_ >= height_ + pad_h_) {
+        --pooled_height_;
+      }
+      if ((pooled_width_ - 1) * stride_w_ >= width_ + pad_w_) {
+        --pooled_width_;
+      }
+      CHECK_LT((pooled_height_ - 1) * stride_h_, height_ + pad_h_);
+      CHECK_LT((pooled_width_ - 1) * stride_w_, width_ + pad_w_);
+    }
+    top[0]->Reshape(bottom[0]->num(), channels_, pooled_height_,
+        pooled_width_);
+    if (top.size() > 1) {
+      top[1]->ReshapeLike(*top[0]);
+    }
+    // If max pooling, we will initialize the vector index part.
+    if (this->layer_param_.pooling_param().pool() ==
+        PoolingParameter_PoolMethod_MAX && top.size() == 1) {
+      max_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_,
+          pooled_width_);
+    }
+    // If stochastic pooling, we will initialize the random index part.
+    if (this->layer_param_.pooling_param().pool() ==
+        PoolingParameter_PoolMethod_STOCHASTIC) {
+      rand_idx_.Reshape(bottom[0]->num(), channels_, pooled_height_, pooled_width_);
+    }
+  } else if (num_spatial_axes_ == 3) {
+    /* Process 3D Pooling */
+    int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
+    const int* input_shape_data = this->input_shape_.cpu_data();
+    if (global_pooling_) {
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        kernel_shape_data[i] = input_shape_data[i+1];
+      }
+    }
+    // compute output shape
+    const int* pad_data = this->pad_.cpu_data();
+    const int* stride_data = this->stride_.cpu_data();
+    vector<int> spatial_dim_blob_shape(1, num_spatial_axes_);
+    output_shape_.Reshape(spatial_dim_blob_shape);
+    int* output_shape_data = output_shape_.mutable_cpu_data();
+    int pad_sum = 0;
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      int oc = static_cast<int>(ceil(static_cast<float>(
+          input_shape_data[i+1] + 2 * pad_data[i]
+          - kernel_shape_data[i]) / stride_data[i])) + 1;
+      pad_sum += pad_data[i];
+      output_shape_data[i] = oc;
+    }
+
+    if (pad_sum) {
+      for (int i = 0; i < num_spatial_axes_; ++i) {
+        if ((output_shape_data[i] - 1) * stride_data[i] >=
+            input_shape_data[i+1] + pad_data[i])
+          --output_shape_data[i];
+        CHECK_LT((output_shape_data[i] - 1) * stride_data[i],
+            input_shape_data[i+1] + pad_data[i]);
+      }
+    }
+
+    vector<int> top_shape = bottom[0]->shape();
+    // Discard input spatial axes
+    top_shape.resize(first_spatial_axis);
+    for (int i = 0; i < num_spatial_axes_; ++i) {
+      top_shape.push_back(output_shape_data[i]);
+    }
+
+    top[0]->Reshape(top_shape);
+    if (top.size() > 1) {
+      top[1]->ReshapeLike(*top[0]);
+    }
+
+    // If max pooling, we will initialize the vector index part.
+    if (this->layer_param_.pooling_param().pool() ==
+        PoolingParameter_PoolMethod_MAX && top.size() == 1) {
+      max_idx_.Reshape(top_shape);
+    }
+    // If stochastic pooling, we will initialize the random index part.
+    if (this->layer_param_.pooling_param().pool() ==
+        PoolingParameter_PoolMethod_STOCHASTIC) {
+      rand_idx_.Reshape(top_shape);
+    }
+  } else {
+    NOT_IMPLEMENTED;
   }
 }
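Note: the `Forward_cpu` changes that follow add an explicit 3D max-pooling path: every output voxel scans a clipped `z/h/w` window and records the flat argmax index in a mask so the backward pass can route gradients. A minimal standalone sketch of that inner loop, with made-up input data and a fixed 2x2x2 kernel:

```cpp
#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <vector>

// Sketch: 3D max pooling with an argmax mask, mirroring the triple loop
// added below (single image, single channel, no padding for brevity).
int main() {
  const int D = 4, H = 4, W = 4, k = 2, s = 2;        // input and kernel dims
  std::vector<float> in(D * H * W);
  for (int i = 0; i < D * H * W; ++i) in[i] = i % 7;  // arbitrary data
  const int OD = D / k, OH = H / k, OW = W / k;
  std::vector<float> out(OD * OH * OW, -FLT_MAX);
  std::vector<int> mask(OD * OH * OW, -1);

  for (int pz = 0; pz < OD; ++pz)
    for (int ph = 0; ph < OH; ++ph)
      for (int pw = 0; pw < OW; ++pw) {
        const int p = (pz * OH + ph) * OW + pw;        // flat output index
        for (int z = pz * s; z < std::min(pz * s + k, D); ++z)
          for (int h = ph * s; h < std::min(ph * s + k, H); ++h)
            for (int w = pw * s; w < std::min(pw * s + k, W); ++w) {
              const int idx = (z * H + h) * W + w;     // flat input index
              if (in[idx] > out[p]) { out[p] = in[idx]; mask[p] = idx; }
            }
      }
  std::printf("out[0]=%g from input index %d\n", out[0], mask[0]);
}
```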
num_; ++n) { + for (int c = 0; c < channels_; ++c) { + long nc = n * channels_ + c; + const Dtype *bottom_data2 = bottom[0]->cpu_data() + nc * bottom_offset; + Dtype *top_data2 = top[0]->mutable_cpu_data() + nc * top_offset; + + for (int pz = 0; pz < output_shape_data[0]; ++pz) { + for (int ph = 0; ph < output_shape_data[1]; ++ph) { + for (int pw = 0; pw < output_shape_data[2]; ++pw) { + int zstart = pz * stride_data[0] - pad_data[0]; + int hstart = ph * stride_data[1] - pad_data[1]; + int wstart = pw * stride_data[2] - pad_data[2]; + + int zend = min(zstart + kernel_shape[0], + input_shape_data[1] + pad_data[0]); + int hend = min(hstart + kernel_shape[1], + input_shape_data[2] + pad_data[1]); + int wend = min(wstart + kernel_shape[2], + input_shape_data[3] + pad_data[2]); + + int pool_size = (hend - hstart) * + (wend - wstart) * + (zend - zstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + zstart = max(zstart, 0); + zend = min(zend, input_shape_data[1]); + hend = min(hend, input_shape_data[2]); + wend = min(wend, input_shape_data[3]); + + const int pool_index = (pz * output_shape_data[1] + ph) * output_shape_data[2] + pw; + for (int z = zstart; z < zend; ++z) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = (z * input_shape_data[2] + h) * input_shape_data[3] + w; + top_data2[pool_index] += bottom_data2[index]; + } + } + } + top_data2[pool_index] /= pool_size; + } + } + } + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + } else { + NOT_IMPLEMENTED; + } } template @@ -210,35 +584,133 @@ void PoolingLayer::Backward_cpu(const vector*>& top, // We'll output the mask to top[1] if it's of size >1. const bool use_top_mask = top.size() > 1; - typename PoolingCodeGeneratorBackward::Callback_t* generator_func = + if (num_spatial_axes_ == 2) { + // Process 2D pooling + typename PoolingCodeGeneratorBackward::Callback_t* generator_func = Backward_code_generator.Get_callback(this, top[0]); - // We are getting top_mask here as mutable_cpu_data is not thread safe - // and doing it inside parallel region creates of risk of race condition - void* mask = NULL; - if (this->layer_param_.pooling_param().pool() == - PoolingParameter_PoolMethod_MAX ) { - mask = (use_top_mask) ? static_cast(top[1]->mutable_cpu_data()) : - static_cast(max_idx_.mutable_cpu_data()); - } + // We are getting top_mask here as mutable_cpu_data is not thread safe + // and doing it inside parallel region creates of risk of race condition + void* mask = NULL; + if (this->layer_param_.pooling_param().pool() == + PoolingParameter_PoolMethod_MAX ) { + mask = (use_top_mask) ? 
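A minimal standalone sketch (hypothetical sizes, not part of the patch) of the window arithmetic the 3D AVE branch above uses along a single axis. It shows why `pool_size` is taken from the pad-extended window before the start/end indices are clipped: zero-padding still counts toward the divisor, matching the 2D AVE pooling convention.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  // One spatial axis with hypothetical sizes.
  const int kernel = 3, stride = 2, pad = 1, input = 7;
  const int output = (input + 2 * pad - kernel) / stride + 1;  // 4
  for (int pz = 0; pz < output; ++pz) {
    int zstart = pz * stride - pad;
    int zend = std::min(zstart + kernel, input + pad);
    int pool_size = zend - zstart;   // divisor includes padded zeros
    zstart = std::max(zstart, 0);    // only the summation range is clipped
    zend = std::min(zend, input);
    printf("pz=%d sums [%d,%d) and divides by %d\n",
           pz, zstart, zend, pool_size);
  }
  return 0;
}
```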
static_cast(top[1]->mutable_cpu_data()) : + static_cast(max_idx_.mutable_cpu_data()); + } + + const int batch_size = bottom[0]->num(); + const int num_channels = bottom[0]->channels(); + +#ifdef _OPENMP + #pragma omp parallel for collapse(2) +#endif + for (int image = 0; image < batch_size; ++image) + for (int channel = 0; channel < num_channels; ++channel) + generator_func(top_diff, + bottom_diff, + image, + image+1, + channel, + channel+1, + use_top_mask, + mask, + this); + } else if (num_spatial_axes_ == 3) { + /* Process 3D pooling */ + const int* kernel_shape = this->kernel_shape_.cpu_data(); + const int* pad_data = this->pad_.cpu_data(); + const int* stride_data = this->stride_.cpu_data(); + const int* input_shape_data = this->input_shape_.cpu_data(); + const int* output_shape_data = this->output_shape_.cpu_data(); + int top_num = top[0]->count(0, channel_axis_); + vector offset(2, 0); + offset[1] = 1; - const int batch_size = bottom[0]->num(); - const int num_channels = bottom[0]->channels(); + long bottom_offset = bottom[0]->offset(offset); + long top_offset = top[0]->offset(offset); + switch (this->layer_param_.pooling_param().pool()) { + case PoolingParameter_PoolMethod_MAX: +#ifdef _OPENMP + #pragma omp parallel for collapse(2) +#endif + for (int n = 0; n < top_num; ++n) { + for (int c = 0; c < channels_; ++c) { + long nc = n * channels_ + c; + Dtype *bottom_diff2 = bottom[0]->mutable_cpu_diff() + nc * bottom_offset; + const Dtype *top_diff2 = top[0]->cpu_diff() + nc * top_offset; + const Dtype *top_mask = NULL; + const int *mask = NULL; + if (use_top_mask) { + top_mask = top[1]->cpu_data() + nc * top_offset; + } else { + mask = max_idx_.cpu_data() + nc * top_offset; + } + + for (int pz = 0; pz < output_shape_data[0]; ++pz) { + for (int ph = 0; ph < output_shape_data[1]; ++ph) { + for (int pw = 0; pw < output_shape_data[2]; ++pw) { + const int index = (pz * output_shape_data[1] + ph) * output_shape_data[2] + pw; + const int bottom_index = use_top_mask ? 
top_mask[index] : mask[index]; + bottom_diff2[bottom_index] += top_diff2[index]; + } + } + } + } + } + break; + case PoolingParameter_PoolMethod_AVE: #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int image = 0; image < batch_size; ++image) - for (int channel = 0; channel < num_channels; ++channel) - generator_func(top_diff, - bottom_diff, - image, - image+1, - channel, - channel+1, - use_top_mask, - mask, - this); + for (int n = 0; n < top_num; ++n) { + for (int c = 0; c < channels_; ++c) { + long nc = n * channels_ + c; + Dtype *bottom_diff2 = bottom[0]->mutable_cpu_diff() + nc * bottom_offset; + const Dtype *top_diff2 = top[0]->cpu_diff() + nc * top_offset; + + for (int pz = 0; pz < output_shape_data[0]; ++pz) { + for (int ph = 0; ph < output_shape_data[1]; ++ph) { + for (int pw = 0; pw < output_shape_data[2]; ++pw) { + int zstart = pz * stride_data[0] - pad_data[0]; + int hstart = ph * stride_data[1] - pad_data[1]; + int wstart = pw * stride_data[2] - pad_data[2]; + + int zend = min(zstart + kernel_shape[0], input_shape_data[1] + pad_data[0]); + int hend = min(hstart + kernel_shape[1], input_shape_data[2] + pad_data[1]); + int wend = min(wstart + kernel_shape[2], input_shape_data[3] + pad_data[2]); + int pool_size = (hend - hstart) * (wend - wstart) * (zend - zstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + zstart = max(zstart, 0); + zend = min(zend, input_shape_data[1]); + hend = min(hend, input_shape_data[2]); + wend = min(wend, input_shape_data[3]); + + const int pool_index = (pz * output_shape_data[1] + ph) * output_shape_data[2] + pw; + for (int z = zstart; z < zend; ++z) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = (z * input_shape_data[2] + h) * input_shape_data[3] + w; + bottom_diff2[index] += top_diff2[pool_index] / pool_size; + } + } + } + } + } + } + } + } + break; + case PoolingParameter_PoolMethod_STOCHASTIC: + NOT_IMPLEMENTED; + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + } else { + NOT_IMPLEMENTED; + } } diff --git a/src/caffe/layers/prelu_layer.cpp b/src/caffe/layers/prelu_layer.cpp index aa102d7f6..9e661255e 100644 --- a/src/caffe/layers/prelu_layer.cpp +++ b/src/caffe/layers/prelu_layer.cpp @@ -51,7 +51,7 @@ void PReLULayer::LayerSetUp(const vector*>& bottom, CHECK_GE(bottom[0]->num_axes(), 2) << "Number of axes of bottom blob must be >=2."; PReLUParameter prelu_param = this->layer_param().prelu_param(); - int channels = bottom[0]->channels(); + int channels = bottom[0]->shape(1); channel_shared_ = prelu_param.channel_shared(); if (this->blobs_.size() > 0) { LOG(INFO) << "Skipping parameter initialization"; @@ -105,9 +105,9 @@ void PReLULayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - const int count = bottom[0]->count(); + const long count = bottom[0]->count(); const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); + const int channels = bottom[0]->shape(1); const Dtype* slope_data = this->blobs_[0]->cpu_data(); // For in-place computation @@ -118,8 +118,8 @@ void PReLULayer::Forward_cpu(const vector*>& bottom, // if channel_shared, channel index in the following computation becomes // always zero. const int div_factor = channel_shared_ ? 
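The `int` to `long` promotions in the PReLU changes below matter once `count = N*C*H*W` exceeds `INT_MAX`; the channel-index formula itself is unchanged. A minimal sketch with made-up dimensions (assuming an LP64 platform where `long` is 64-bit, as the patch itself does):

```cpp
#include <cstdio>

int main() {
  // Hypothetical large blob: 64 x 256 x 512 x 512 = 2^32 elements,
  // which no longer fits in a 32-bit int.
  const long num = 64, channels = 256, dim = 512L * 512;  // dim = count(2)
  const long count = num * channels * dim;
  const int div_factor = 1;  // channel_shared_ ? channels : 1
  long acc = 0;
  for (long i = 0; i < count; i += count / 4) {
    long c = (i / dim) % channels / div_factor;  // channel of element i
    acc += c;
  }
  printf("count=%ld sample channel sum=%ld\n", count, acc);
  return 0;
}
```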
channels : 1; - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; + for (long i = 0; i < count; ++i) { + long c = (i / dim) % channels / div_factor; top_data[i] = std::max(bottom_data[i], Dtype(0)) + slope_data[c] * std::min(bottom_data[i], Dtype(0)); } @@ -132,9 +132,9 @@ void PReLULayer::Backward_cpu(const vector*>& top, const Dtype* bottom_data = bottom[0]->cpu_data(); const Dtype* slope_data = this->blobs_[0]->cpu_data(); const Dtype* top_diff = top[0]->cpu_diff(); - const int count = bottom[0]->count(); - const int dim = bottom[0]->count(2); - const int channels = bottom[0]->channels(); + const long count = bottom[0]->count(); + const long dim = bottom[0]->count(2); + const int channels = bottom[0]->shape(1); // For in-place computation if (top[0] == bottom[0]) { @@ -151,16 +151,16 @@ void PReLULayer::Backward_cpu(const vector*>& top, // keep top_diff unchanged. if (this->param_propagate_down_[0]) { Dtype* slope_diff = this->blobs_[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; + for (long i = 0; i < count; ++i) { + long c = (i / dim) % channels / div_factor; slope_diff[c] += top_diff[i] * bottom_data[i] * (bottom_data[i] <= 0); } } // Propagate to bottom if (propagate_down[0]) { Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - for (int i = 0; i < count; ++i) { - int c = (i / dim) % channels / div_factor; + for (long i = 0; i < count; ++i) { + long c = (i / dim) % channels / div_factor; bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0) + slope_data[c] * (bottom_data[i] <= 0)); } diff --git a/src/caffe/layers/psroi_pooling_layer.cpp b/src/caffe/layers/psroi_pooling_layer.cpp new file mode 100644 index 000000000..06bf6ee03 --- /dev/null +++ b/src/caffe/layers/psroi_pooling_layer.cpp @@ -0,0 +1,296 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include +#include + +#include +#include +#include + +#include "caffe/layers/psroi_pooling_layer.hpp" +#include "caffe/util/math_functions.hpp" + +#ifdef _OPENMP +#include +#endif + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace caffe { + template + void PSROIPoolingLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + PSROIPoolingParameter psroi_pooling_param = + this->layer_param_.psroi_pooling_param(); + spatial_scale_ = psroi_pooling_param.spatial_scale(); + LOG(ERROR) << "Spatial scale: " << spatial_scale_; + + CHECK_GT(psroi_pooling_param.output_dim(), 0) + << "output_dim must be > 0"; + CHECK_GT(psroi_pooling_param.group_size(), 0) + << "group_size must be > 0"; + + output_dim_ = psroi_pooling_param.output_dim(); + group_size_ = psroi_pooling_param.group_size(); + pooled_height_ = group_size_; + pooled_width_ = group_size_; + } + + template + void PSROIPoolingLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + // LOG(INFO) << "psroi pooling reshape"; + channels_ = bottom[0]->channels(); + CHECK_EQ(channels_, output_dim_ * group_size_ * group_size_) + << "input channel number does not match layer parameters"; + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + top[0]->Reshape( + bottom[1]->num(), output_dim_, pooled_height_, pooled_width_); + mapping_channel_.Reshape( + bottom[1]->num(), output_dim_, pooled_height_, pooled_width_); + } + + + template + static void PSROIPoolingForward( + const int num, + const Dtype* bottom_data, + const Dtype spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, + const int output_dim, + const int group_size, + Dtype* top_data, + int* mapping_channel) { + // LOG(INFO) << "psroi pooling cpu_forward"; + int pixels = width * height; +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int n = 0; n < num; ++n) { + // per roi + + int roi_add = n * 5; + // [start, end) interval for spatial sampling + int roi_batch_ind = bottom_rois[roi_add]; + Dtype roi_start_w = + static_cast(round(bottom_rois[roi_add + 1])) * spatial_scale; + Dtype roi_start_h = + static_cast(round(bottom_rois[roi_add + 2])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[roi_add + 3]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[roi_add + 4]) + 1.) 
* spatial_scale;
+
+      // Force too small ROIs to be 1x1
+      Dtype roi_width = max(roi_end_w - roi_start_w, Dtype(0.1));  // avoid 0
+      Dtype roi_height = max(roi_end_h - roi_start_h, Dtype(0.1));
+
+      // Compute w and h at bottom
+      Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
+      Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
+
+      int top_roi_offset = n * output_dim * pooled_height * pooled_width;
+      for (int ctop = 0; ctop < output_dim; ++ctop) {
+        // per category
+        int top_plane_offset = top_roi_offset + ctop * pooled_height * pooled_width;
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          int top_row_offset = top_plane_offset + ph * pooled_width;
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            int index = top_row_offset + pw;
+            // The output is in order (n, ctop, ph, pw)
+            int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
+            int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
+            int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
+            int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
+            // Add roi offsets and clip to input boundaries
+            hstart = min(max(hstart, 0), height);
+            hend = min(max(hend, 0), height);
+            wstart = min(max(wstart, 0), width);
+            wend = min(max(wend, 0), width);
+
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            int gw = pw;
+            int gh = ph;
+            // Position-sensitive mapping: bin (ctop, gh, gw) reads only
+            // input channel c.
+            int c = (ctop * group_size + gh) * group_size + gw;
+
+            Dtype out_sum = 0;
+            int bottom_base_offset = (roi_batch_ind * channels + c) * pixels;
+            const Dtype *current_bottom = bottom_data + bottom_base_offset;
+            for (int h = hstart; h < hend; ++h) {
+              int roi_row_offset = h * width;
+              for (int w = wstart; w < wend; ++w) {
+                int bottom_index = roi_row_offset + w;
+                out_sum += current_bottom[bottom_index];
+              }
+            }
+
+            Dtype bin_area = (hend - hstart) * (wend - wstart);
+            top_data[index] = is_empty ? 0.
: out_sum / bin_area; + + mapping_channel[index] = c; + } + } + } + } + } + + + template + void PSROIPoolingLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_rois = bottom[1]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + int* mapping_channel_ptr = mapping_channel_.mutable_cpu_data(); + int count = top[0]->count(); + caffe_set(count, Dtype(0), top_data); + caffe_set(count, -1, mapping_channel_ptr); + + PSROIPoolingForward(bottom[1]->num(), bottom_data, spatial_scale_, + channels_, height_, width_, pooled_height_, + pooled_width_, bottom_rois, output_dim_, group_size_, + top_data, mapping_channel_ptr); + } + + template + static void PSROIPoolingBackward( + const int num, + const Dtype* top_diff, + const int* mapping_channel, + const int num_rois, + const Dtype spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + Dtype* bottom_diff, + const Dtype* bottom_rois) { + // LOG(INFO) << "psroipooling backward cpu"; + int pixels = height * width; +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int i = 0; i < num; ++i) { + // The output is in order (n, ctop, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int n = i / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + int roi_add = n * 5; + int roi_batch_ind = bottom_rois[roi_add]; + Dtype roi_start_w = + static_cast(round(bottom_rois[roi_add + 1])) * spatial_scale; + Dtype roi_start_h = + static_cast(round(bottom_rois[roi_add + 2])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[roi_add + 3]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[roi_add + 4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + Dtype roi_width = max(roi_end_w - roi_start_w, (Dtype)0.1); // avoid 0 + Dtype roi_height = max(roi_end_h - roi_start_h, (Dtype)0.1); + + // Compute w and h at bottom + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph)* bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Compute c at bottom + int c = mapping_channel[i]; + Dtype* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * pixels; + Dtype bin_area = (hend - hstart) * (wend - wstart); + Dtype diff_val = is_empty ? 0. 
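A minimal sketch (hypothetical sizes, not part of the patch) of the position-sensitive channel mapping used by both the CPU and GPU PSROI kernels: output bin (ctop, ph, pw) reads exclusively from input channel `c = (ctop * group_size + ph) * group_size + pw`, which is why `Reshape` checks `channels_ == output_dim_ * group_size_ * group_size_`.

```cpp
#include <cstdio>

int main() {
  const int group_size = 3, output_dim = 2;                   // hypothetical
  const int channels = output_dim * group_size * group_size;  // 18
  for (int ctop = 0; ctop < output_dim; ++ctop)
    for (int ph = 0; ph < group_size; ++ph)
      for (int pw = 0; pw < group_size; ++pw) {
        int c = (ctop * group_size + ph) * group_size + pw;
        printf("bin(ctop=%d, ph=%d, pw=%d) <- channel %d of %d\n",
               ctop, ph, pw, c, channels);
      }
  return 0;
}
```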
: top_diff[i] / bin_area; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + offset_bottom_diff[h * width + w] += diff_val; + } + } + } + } + + template + void PSROIPoolingLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + + if (!propagate_down[0]) { + return; + } + + const Dtype* bottom_rois = bottom[1]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int bottom_count = bottom[0]->count(); + const int* mapping_channel_ptr = mapping_channel_.cpu_data(); + caffe_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_cpu_diff()); + caffe_set(bottom_count, Dtype(0), bottom_diff); + const int count = top[0]->count(); + PSROIPoolingBackward(count, top_diff, mapping_channel_ptr, + top[0]->num(), spatial_scale_, channels_, height_, width_, + pooled_height_, pooled_width_, output_dim_, bottom_diff, + bottom_rois); + } + + +#ifdef CPU_ONLY + STUB_GPU(PSROIPoolingLayer); +#endif + + INSTANTIATE_CLASS(PSROIPoolingLayer); + REGISTER_LAYER_CLASS(PSROIPooling); + +} // namespace caffe diff --git a/src/caffe/layers/psroi_pooling_layer.cu b/src/caffe/layers/psroi_pooling_layer.cu new file mode 100644 index 000000000..487408f82 --- /dev/null +++ b/src/caffe/layers/psroi_pooling_layer.cu @@ -0,0 +1,238 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include + +#include "caffe/layers/psroi_pooling_layer.hpp" +#include "caffe/util/gpu_util.cuh" + +using std::max; +using std::min; + +namespace caffe { + + template + __global__ void PSROIPoolingForward( + const int nthreads, + const Dtype* bottom_data, + const Dtype spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, + const int output_dim, + const int group_size, + Dtype* top_data, + int* mapping_channel) { + CUDA_KERNEL_LOOP(index, nthreads) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + bottom_rois += n * 5; + int roi_batch_ind = bottom_rois[0]; + Dtype roi_start_w = + static_cast(round(bottom_rois[1])) * spatial_scale; + Dtype roi_start_h = + static_cast(round(bottom_rois[2])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[3]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + Dtype roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = pw; + int gh = ph; + int c = (ctop*group_size + gh)*group_size + gw; + + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + out_sum += bottom_data[bottom_index]; + } + } + + Dtype bin_area = (hend - hstart)*(wend - wstart); + top_data[index] = is_empty? 0. 
: out_sum/bin_area; + mapping_channel[index] = c; + } + } + + template + void PSROIPoolingLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* bottom_rois = bottom[1]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int* mapping_channel_ptr = mapping_channel_.mutable_gpu_data(); + int count = top[0]->count(); + caffe_gpu_set(count, Dtype(0), top_data); + caffe_gpu_set(count, -1, mapping_channel_ptr); + // NOLINT_NEXT_LINE(whitespace/operators) + PSROIPoolingForward << > >(count, bottom_data, spatial_scale_, + channels_, height_, width_, pooled_height_, + pooled_width_, bottom_rois, output_dim_, group_size_, + top_data, mapping_channel_ptr); + CUDA_POST_KERNEL_CHECK; + } + + template + __global__ void PSROIPoolingBackwardAtomic( + const int nthreads, + const Dtype* top_diff, + const int* mapping_channel, + const int num_rois, + const Dtype spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + Dtype* bottom_diff, + const Dtype* bottom_rois) { + CUDA_KERNEL_LOOP(index, nthreads) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + bottom_rois += n * 5; + int roi_batch_ind = bottom_rois[0]; + Dtype roi_start_w = + static_cast(round(bottom_rois[1])) * spatial_scale; + Dtype roi_start_h = + static_cast(round(bottom_rois[2])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[3]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[4]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + Dtype roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + Dtype roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph)* bin_size_h + + roi_start_h); + int wstart = floor(static_cast(pw)* bin_size_w + + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Compute c at bottom + int c = mapping_channel[index]; + Dtype* offset_bottom_diff = bottom_diff + + (roi_batch_ind * channels + c) * height * width; + Dtype bin_area = (hend - hstart)*(wend - wstart); + Dtype diff_val = is_empty ? 0. 
: top_diff[index] / bin_area; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h*width + w; + caffe_gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index); + } + } + } + } + + template + void PSROIPoolingLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + + const Dtype* bottom_rois = bottom[1]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int bottom_count = bottom[0]->count(); + const int* mapping_channel_ptr = mapping_channel_.gpu_data(); + caffe_gpu_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_gpu_diff()); + caffe_gpu_set(bottom_count, Dtype(0), bottom_diff); + const int count = top[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + PSROIPoolingBackwardAtomic << > >(count, top_diff, mapping_channel_ptr, + top[0]->num(), spatial_scale_, channels_, height_, width_, + pooled_height_, pooled_width_, output_dim_, bottom_diff, + bottom_rois); + CUDA_POST_KERNEL_CHECK; + } + + INSTANTIATE_LAYER_GPU_FUNCS(PSROIPoolingLayer); + +} // namespace caffe diff --git a/src/caffe/layers/roi_pooling_layer.cpp b/src/caffe/layers/roi_pooling_layer.cpp new file mode 100644 index 000000000..438a2d749 --- /dev/null +++ b/src/caffe/layers/roi_pooling_layer.cpp @@ -0,0 +1,376 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include + +#include "caffe/layers/roi_pooling_layer.hpp" + +using std::max; +using std::min; +using std::floor; +using std::ceil; + +namespace caffe { + +template +void ROIPoolingLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + const int num_axes = bottom[0]->num_axes(); + num_spatial_axes_ = num_axes - 2; + + ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param(); + if (num_spatial_axes_ == 2) { + if (roi_pool_param.pooled_size_size()) { + CHECK(roi_pool_param.pooled_size_size() == 1 || roi_pool_param.pooled_size_size() == 2) + << "pooled_size must be specified once, or 2 values for Height and Width"; + if (roi_pool_param.pooled_size_size() == 1) { + pooled_h_ = pooled_w_ = roi_pool_param.pooled_size(0); + } else { + pooled_h_ = roi_pool_param.pooled_size(0); + pooled_w_ = roi_pool_param.pooled_size(1); + } + } else { + pooled_h_ = roi_pool_param.pooled_h(); + pooled_w_ = roi_pool_param.pooled_w(); + } + CHECK_GT(pooled_h_, 0) << "pooled_h must be > 0"; + CHECK_GT(pooled_w_, 0) << "pooled_w must be > 0"; + } else if (num_spatial_axes_ == 3) { + if (roi_pool_param.has_pooled_h() || roi_pool_param.has_pooled_w()) { + CHECK_EQ(num_spatial_axes_, 2) + << "kernel_h & kernel_w can only be used for 2D pooling."; + CHECK_EQ(0, roi_pool_param.pooled_size_size()) + << "Either kernel_size or kernel_h/w should be specified, not both."; + pooled_h_ = roi_pool_param.pooled_h(); + pooled_w_ = roi_pool_param.pooled_w(); + } else { + const int num_dims = roi_pool_param.pooled_size_size(); + CHECK(num_dims == 1 || num_dims == num_spatial_axes_) + << "pooled_size must be specified once, or once per spatial dimension" + << " (pooled_size specified " << num_dims << " times " + << num_spatial_axes_ << " spatial dims)."; + if (num_dims == 1) { + pooled_d_ = pooled_h_ = pooled_w_ = roi_pool_param.pooled_size(0); + } else { + pooled_d_ = roi_pool_param.pooled_size(0); + pooled_h_ = roi_pool_param.pooled_size(1); + pooled_w_ = roi_pool_param.pooled_size(2); + } + + CHECK_GT(pooled_d_, 0) << "pooled_d must be > 0"; + CHECK_GT(pooled_h_, 0) << "pooled_h must be > 0"; + CHECK_GT(pooled_w_, 0) << "pooled_w must be > 0"; + } + } else { + NOT_IMPLEMENTED; + } + + spatial_scale_ = roi_pool_param.spatial_scale(); + LOG(INFO) << "Spatial scale: " << spatial_scale_; +} + +template +void ROIPoolingLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + channels_ = bottom[0]->shape(1); + if (num_spatial_axes_ == 2) { + height_ = bottom[0]->height(); + width_ = bottom[0]->width(); + top[0]->Reshape(bottom[1]->shape(0), channels_, pooled_h_, pooled_w_); + max_idx_.Reshape(bottom[1]->shape(0), channels_, pooled_h_, pooled_w_); + } else { + depth_ = bottom[0]->shape(2); + height_ = bottom[0]->shape(3); + width_ = bottom[0]->shape(4); + vector pooled_shape(bottom[0]->num_axes(), 0); + pooled_shape[0] = bottom[1]->shape(0); + pooled_shape[1] = channels_; + pooled_shape[2] = pooled_d_; + pooled_shape[3] = pooled_h_; + pooled_shape[4] = pooled_w_; + top[0]->Reshape(pooled_shape); + max_idx_.Reshape(pooled_shape); + } +} + +template +void ROIPoolingLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* bottom_rois = bottom[1]->cpu_data(); + // Number of ROIs + int num_rois = bottom[1]->shape(0); + + int batch_size = bottom[0]->shape(0); + size_t top_count = top[0]->count(); + + Dtype* top_data = top[0]->mutable_cpu_data(); + caffe_set(top_count, Dtype(-FLT_MAX), top_data); + 
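+  // top_data is primed with -FLT_MAX so the first input visited in each bin
+  // wins the max comparison; argmax entries are primed with -1 below so bins
+  // that pool nothing propagate no gradient in the backward pass.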
int* argmax_data = max_idx_.mutable_cpu_data(); + caffe_set(top_count, -1, argmax_data); + + if (num_spatial_axes_ == 2) { + int roi_offset = bottom[1]->offset(1); + size_t top_offset = top[0]->offset(1); + // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int n = 0; n < num_rois; ++n) { + Dtype* cur_top = top_data + n * top_offset; + int* cur_argmax = argmax_data + n * top_offset; + const Dtype* roi = bottom_rois + n * roi_offset; + int roi_batch_ind = roi[0]; + int roi_start_w = round(roi[1] * spatial_scale_); + int roi_start_h = round(roi[2] * spatial_scale_); + int roi_end_w = round(roi[3] * spatial_scale_); + int roi_end_h = round(roi[4] * spatial_scale_); + CHECK_GE(roi_batch_ind, 0); + CHECK_LT(roi_batch_ind, batch_size); + + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + const Dtype bin_size_h = static_cast(roi_height) / static_cast(pooled_h_); + const Dtype bin_size_w = static_cast(roi_width) / static_cast(pooled_w_); + + const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind); + + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_h_; ++ph) { + for (int pw = 0; pw < pooled_w_; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_h_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_h_) + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height_); + hend = min(max(hend + roi_start_h, 0), height_); + wstart = min(max(wstart + roi_start_w, 0), width_); + wend = min(max(wend + roi_start_w, 0), width_); + + bool is_empty = (hend <= hstart) || (wend <= wstart); + + const int pool_index = ph * pooled_w_ + pw; + if (is_empty) { + cur_top[pool_index] = 0; + cur_argmax[pool_index] = -1; + continue; + } + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width_ + w; + if (batch_data[index] > cur_top[pool_index]) { + cur_top[pool_index] = batch_data[index]; + cur_argmax[pool_index] = index; + } + } + } + } + } + + // Increment all data pointers by one channel + batch_data += bottom[0]->offset(0, 1); + cur_top += top[0]->offset(0, 1); + cur_argmax += max_idx_.offset(0, 1); + } + } + } else if (num_spatial_axes_ == 3) { + vector roi_offset_vec(1, 0); + roi_offset_vec[0] = 1; + int roi_offset = bottom[1]->offset(roi_offset_vec); + + vector top_offset_vec(1, 0); + top_offset_vec[0] = 1; + size_t top_offset = top[0]->offset(top_offset_vec); + +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int n = 0; n < num_rois; ++n) { + Dtype* cur_top = top_data + n * top_offset; + int* cur_argmax = argmax_data + n * top_offset; + const Dtype* roi = bottom_rois + n * roi_offset; + int roi_batch_ind = roi[0]; + int roi_start_d = round(roi[1] * spatial_scale_); + int roi_start_w = round(roi[2] * spatial_scale_); + int roi_start_h = round(roi[3] * spatial_scale_); + int roi_end_d = round(roi[4] * spatial_scale_); + int roi_end_w = round(roi[5] * spatial_scale_); + int roi_end_h = round(roi[6] * spatial_scale_); + CHECK_GE(roi_batch_ind, 0); + CHECK_LT(roi_batch_ind, batch_size); + + int roi_depth = max(roi_end_d - roi_start_d + 1, 1); + int roi_height = 
max(roi_end_h - roi_start_h + 1, 1); + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + const Dtype bin_size_d = static_cast(roi_depth) / static_cast(pooled_d_); + const Dtype bin_size_h = static_cast(roi_height) / static_cast(pooled_h_); + const Dtype bin_size_w = static_cast(roi_width) / static_cast(pooled_w_); + + vector roi_batch_ind_offset(1, 0); + roi_batch_ind_offset[0] = roi_batch_ind; + const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind_offset); + + for (int c = 0; c < channels_; ++c) { + for (int pd = 0; pd < pooled_d_; ++pd) { + for (int ph = 0; ph < pooled_h_; ++ph) { + for (int pw = 0; pw < pooled_w_; ++pw) { + int dstart = static_cast(floor(static_cast(pd) * bin_size_d)); + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int dend = static_cast(ceil(static_cast(pd + 1) * bin_size_d)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + dstart = min(max(dstart + roi_start_d, 0), depth_); + dend = min(max(dend + roi_start_d, 0), depth_); + hstart = min(max(hstart + roi_start_h, 0), height_); + hend = min(max(hend + roi_start_h, 0), height_); + wstart = min(max(wstart + roi_start_w, 0), width_); + wend = min(max(wend + roi_start_w, 0), width_); + + bool is_empty = (dend <= dstart) || (hend <= hstart) || (wend <= wstart); + + const int pool_index = (pd * pooled_h_ + ph) * pooled_w_ + pw; + if (is_empty) { + cur_top[pool_index] = 0; + cur_argmax[pool_index] = -1; + continue; + } + + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = (d * height_ + h) * width_ + w; + if (batch_data[index] > cur_top[pool_index]) { + cur_top[pool_index] = batch_data[index]; + cur_argmax[pool_index] = index; + } + } + } + } + } + } + } + + // Increment all data pointers by one channel + vector offset_vec(2, 0); + offset_vec[1] = 1; + batch_data += bottom[0]->offset(offset_vec); + cur_top += top[0]->offset(offset_vec); + cur_argmax += max_idx_.offset(offset_vec); + } + } + } +} + +template +void ROIPoolingLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + + const Dtype* bottom_rois = bottom[1]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + caffe_set(bottom[0]->count(), Dtype(0.), bottom_diff); + const int* argmax_data = max_idx_.cpu_data(); + const int num_rois = top[0]->shape(0); + + if (num_spatial_axes_ == 2) { + // Accumulate gradient over all ROIs + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + int roi_batch_ind = bottom_rois[roi_n * 5]; + // Accumulate gradients over each bin in this ROI +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int c = 0; c < channels_; ++c) { + for (int ph = 0; ph < pooled_h_; ++ph) { + for (int pw = 0; pw < pooled_w_; ++pw) { + int offset_top = ((roi_n * channels_ + c) * pooled_h_ + ph) * pooled_w_ + pw; + int argmax_index = argmax_data[offset_top]; + if (argmax_index >= 0) { + int offset_bottom = (roi_batch_ind * channels_ + c) * height_ * width_ + argmax_index; + bottom_diff[offset_bottom] += top_diff[offset_top]; + } + } + } + } + } + } else if (num_spatial_axes_ == 3) { + // Accumulate gradient over all ROIs + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + int roi_batch_ind = bottom_rois[roi_n * 7]; + // Accumulate 
gradients over each bin in this ROI +#ifdef _OPENMP + #pragma omp parallel for +#endif + for (int c = 0; c < channels_; ++c) { + for (int pd = 0; pd < pooled_d_; ++pd) { + for (int ph = 0; ph < pooled_h_; ++ph) { + for (int pw = 0; pw < pooled_w_; ++pw) { + int offset_top = (((roi_n * channels_ + c) * pooled_d_ + pd) * pooled_h_ + ph) * pooled_w_ + pw; + int argmax_index = argmax_data[offset_top]; + if (argmax_index >= 0) { + int offset_bottom = (roi_batch_ind * channels_ + c) * depth_ * height_ * width_ + argmax_index; + bottom_diff[offset_bottom] += top_diff[offset_top]; + } + } + } + } + } + } + } +} + +#ifdef CPU_ONLY +STUB_GPU(ROIPoolingLayer); +#endif + +INSTANTIATE_CLASS(ROIPoolingLayer); +REGISTER_LAYER_CLASS(ROIPooling); + +} // namespace caffe diff --git a/src/caffe/layers/roi_pooling_layer.cu b/src/caffe/layers/roi_pooling_layer.cu new file mode 100644 index 000000000..51d276f4f --- /dev/null +++ b/src/caffe/layers/roi_pooling_layer.cu @@ -0,0 +1,222 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + +#include +#include +#include + +#include "caffe/layers/roi_pooling_layer.hpp" + + +using std::max; +using std::min; + +namespace caffe { + +template +__global__ void ROIPoolForward(const int nthreads, const Dtype* bottom_data, + const Dtype spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, Dtype* top_data, int* argmax_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + bottom_rois += n * 5; + int roi_batch_ind = bottom_rois[0]; + int roi_start_w = round(bottom_rois[1] * spatial_scale); + int roi_start_h = round(bottom_rois[2] * spatial_scale); + int roi_end_w = round(bottom_rois[3] * spatial_scale); + int roi_end_h = round(bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + Dtype bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + Dtype bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + Dtype maxval = is_empty ? 
0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + bottom_data += (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (bottom_data[bottom_index] > maxval) { + maxval = bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } +} + +template +void ROIPoolingLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* bottom_rois = bottom[1]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + int* argmax_data = max_idx_.mutable_gpu_data(); + int count = top[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) + ROIPoolForward<<>>( + count, bottom_data, spatial_scale_, channels_, height_, width_, + pooled_height_, pooled_width_, bottom_rois, top_data, argmax_data); + CUDA_POST_KERNEL_CHECK; +} + +template +__global__ void ROIPoolBackward(const int nthreads, const Dtype* top_diff, + const int* argmax_data, const int num_rois, const Dtype spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, Dtype* bottom_diff, + const Dtype* bottom_rois) { + CUDA_KERNEL_LOOP(index, nthreads) { + // (n, c, h, w) coords in bottom data + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + Dtype gradient = 0; + // Accumulate gradient over all ROIs that pooled this element + for (int roi_n = 0; roi_n < num_rois; ++roi_n) { + const Dtype* offset_bottom_rois = bottom_rois + roi_n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + // Skip if ROI's batch index doesn't match n + if (n != roi_batch_ind) { + continue; + } + + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Skip if ROI doesn't include (h, w) + const bool in_roi = (w >= roi_start_w && w <= roi_end_w && + h >= roi_start_h && h <= roi_end_h); + if (!in_roi) { + continue; + } + + int offset = (roi_n * channels + c) * pooled_height * pooled_width; + const Dtype* offset_top_diff = top_diff + offset; + const int* offset_argmax_data = argmax_data + offset; + + // Compute feasible set of pooled units that could have pooled + // this bottom unit + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + + Dtype bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + Dtype bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int phstart = floor(static_cast(h - roi_start_h) / bin_size_h); + int phend = ceil(static_cast(h - roi_start_h + 1) / bin_size_h); + int pwstart = floor(static_cast(w - roi_start_w) / bin_size_w); + int pwend = ceil(static_cast(w - roi_start_w + 1) / bin_size_w); + + phstart = min(max(phstart, 0), pooled_height); + phend = min(max(phend, 0), pooled_height); + pwstart = min(max(pwstart, 0), pooled_width); + pwend = min(max(pwend, 0), pooled_width); + + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (offset_argmax_data[ph * pooled_width + pw] == (h * width + w)) { + 
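+            // A bottom element receives gradient only from pooled outputs
+            // whose recorded argmax is exactly this index; everything else
+            // in the ROI contributes zero.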
gradient += offset_top_diff[ph * pooled_width + pw]; + } + } + } + } + bottom_diff[index] = gradient; + } +} + +template +void ROIPoolingLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (!propagate_down[0]) { + return; + } + const Dtype* bottom_rois = bottom[1]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + caffe_gpu_set(count, Dtype(0.), bottom_diff); + const int* argmax_data = max_idx_.gpu_data(); + // NOLINT_NEXT_LINE(whitespace/operators) + ROIPoolBackward<<>>( + count, top_diff, argmax_data, top[0]->num(), spatial_scale_, channels_, + height_, width_, pooled_height_, pooled_width_, bottom_diff, bottom_rois); + CUDA_POST_KERNEL_CHECK; +} + +INSTANTIATE_LAYER_GPU_FUNCS(ROIPoolingLayer); + +} // namespace caffe diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 046cb9d3a..b9877e6a3 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -5,6 +5,72 @@ namespace caffe { + +template +__global__ void SigmoidCrossEntropyLossForwardGPU(const int nthreads, + const Dtype* input_data, const Dtype* target, Dtype* loss, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { + CUDA_KERNEL_LOOP(i, nthreads) { + const int target_value = static_cast(target[i]); + if (has_ignore_label_ && target_value == ignore_label_) { + loss[i] = 0; + counts[i] = 0; + } else { + loss[i] = input_data[i] * (target[i] - (input_data[i] >= 0)) - + log(1 + exp(input_data[i] - 2 * input_data[i] * + (input_data[i] >= 0))); + counts[i] = 1; + } + } +} + +template +__global__ void SigmoidCrossEntropyLossIgnoreDiffGPU(const int count, + const int ignore_label, const Dtype* target, Dtype* diff) { + CUDA_KERNEL_LOOP(i, count) { + const int target_value = static_cast(target[i]); + if (target_value == ignore_label) { + diff[i] = 0; + } + } +} + + +template +void SigmoidCrossEntropyLossLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + // The forward pass computes the sigmoid outputs. + sigmoid_bottom_vec_[0] = bottom[0]; + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + // Compute the loss (negative log likelihood) + const int count = bottom[0]->count(); + // Stable version of loss computation from input data + const Dtype* input_data = bottom[0]->gpu_data(); + const Dtype* target = bottom[1]->gpu_data(); + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + Dtype* count_data = bottom[1]->mutable_gpu_diff(); + Dtype valid_count; + // NOLINT_NEXT_LINE(whitespace/operators) + SigmoidCrossEntropyLossForwardGPU<<>>(count, input_data, target, loss_data, + has_ignore_label_, ignore_label_, count_data); + // Only launch another CUDA kernel if we actually need the valid count. 
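+  // For other normalization modes, or when there is no ignore_label, every
+  // element is valid, so valid_count is just count and the extra reduction
+  // over count_data can be skipped.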
+  if (normalization_ == LossParameter_NormalizationMode_VALID &&
+      has_ignore_label_) {
+    caffe_gpu_asum(count, count_data, &valid_count);
+  } else {
+    valid_count = count;
+  }
+  Dtype loss;
+  caffe_gpu_asum(count, loss_data, &loss);
+  normalizer_ = get_normalizer(normalization_, valid_count);
+  top[0]->mutable_cpu_data()[0] = loss / normalizer_;
+}
+
 template <typename Dtype>
 void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
     const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
@@ -16,19 +82,23 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Backward_gpu(
   if (propagate_down[0]) {
     // First, compute the diff
     const int count = bottom[0]->count();
-    const int num = bottom[0]->num();
     const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
     const Dtype* target = bottom[1]->gpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
     caffe_copy(count, sigmoid_output_data, bottom_diff);
     caffe_gpu_axpy(count, Dtype(-1), target, bottom_diff);
+    // Zero out gradient of ignored targets.
+    if (has_ignore_label_) {
+      // NOLINT_NEXT_LINE(whitespace/operators)
+      SigmoidCrossEntropyLossIgnoreDiffGPU<Dtype><<<CAFFE_GET_BLOCKS(count),
+          CAFFE_CUDA_NUM_THREADS>>>(count, ignore_label_, target, bottom_diff);
+    }
     // Scale down gradient
-    const Dtype loss_weight = top[0]->cpu_diff()[0];
-    caffe_gpu_scal(count, loss_weight / num, bottom_diff);
+    Dtype loss_weight = top[0]->cpu_diff()[0] / normalizer_;
+    caffe_gpu_scal(count, loss_weight, bottom_diff);
   }
 }
 
-INSTANTIATE_LAYER_GPU_BACKWARD(SigmoidCrossEntropyLossLayer);
-
+INSTANTIATE_LAYER_GPU_FUNCS(SigmoidCrossEntropyLossLayer);
 }  // namespace caffe
diff --git a/src/caffe/layers/smooth_L1_loss_layer.cpp b/src/caffe/layers/smooth_L1_loss_layer.cpp
index 384ce3c9f..a660a1fa8 100644
--- a/src/caffe/layers/smooth_L1_loss_layer.cpp
+++ b/src/caffe/layers/smooth_L1_loss_layer.cpp
@@ -53,8 +53,12 @@ namespace caffe {
 template <typename Dtype>
 void SmoothL1LossLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-  LossLayer<Dtype>::LayerSetUp(bottom, top);
-  has_weights_ = (bottom.size() == 3);
+  SmoothL1LossParameter loss_param = this->layer_param_.smooth_l1_loss_param();
+  sigma2_ = loss_param.sigma() * loss_param.sigma();
+  has_weights_ = (bottom.size() >= 3);
+  if (has_weights_) {
+    CHECK_EQ(bottom.size(), 4)
+        << "If weights are used, must specify both inside and outside weights";
+  }
 }
 
 template <typename Dtype>
@@ -68,71 +72,98 @@ void SmoothL1LossLayer<Dtype>::Reshape(
     CHECK_EQ(bottom[0]->channels(), bottom[2]->channels());
     CHECK_EQ(bottom[0]->height(), bottom[2]->height());
     CHECK_EQ(bottom[0]->width(), bottom[2]->width());
+    CHECK_EQ(bottom[0]->channels(), bottom[3]->channels());
+    CHECK_EQ(bottom[0]->height(), bottom[3]->height());
+    CHECK_EQ(bottom[0]->width(), bottom[3]->width());
   }
   diff_.Reshape(bottom[0]->num(), bottom[0]->channels(),
       bottom[0]->height(), bottom[0]->width());
   errors_.Reshape(bottom[0]->num(), bottom[0]->channels(),
       bottom[0]->height(), bottom[0]->width());
+
+  // vector of ones used to sum
+  ones_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+  for (int i = 0; i < bottom[0]->count(); ++i) {
+    ones_.mutable_cpu_data()[i] = Dtype(1);
+  }
 }
 
 template <typename Dtype>
-void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
-    const vector<Blob<Dtype>*>& top) {
-  int count = bottom[0]->count();
-  caffe_sub(
-      count,
-      bottom[0]->cpu_data(),
-      bottom[1]->cpu_data(),
-      diff_.mutable_cpu_data());
-  if (has_weights_) {
-    caffe_mul(
-        count,
-        bottom[2]->cpu_data(),
-        diff_.cpu_data(),
-        diff_.mutable_cpu_data());  // d := w * (b0 - b1)
-  }
-  const Dtype* diff_data = diff_.cpu_data();
-  Dtype* error_data = errors_.mutable_cpu_data();
-  for (int i
= 0; i < count; ++i) { - Dtype val = diff_data[i]; - Dtype abs_val = fabs(val); - if (abs_val < 1.) { - error_data[i] = 0.5 * val * val; - } else { - error_data[i] = abs_val - 0.5; +void SmoothL1LossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) +{ + int count = bottom[0]->count(); + caffe_sub(count, bottom[0]->cpu_data(), bottom[1]->cpu_data(), diff_.mutable_cpu_data()); // d := b0 - b1 + if (has_weights_) { + // apply "inside" weights + caffe_mul(count, bottom[2]->cpu_data(), diff_.cpu_data(), diff_.mutable_cpu_data()); // d := w_in * (b0 - b1) } - } - top[0]->mutable_cpu_data()[0] = - caffe_cpu_asum(count, errors_.cpu_data()) / bottom[0]->num(); + + for(int i = 0; i < count; i++) { + Dtype val = diff_.cpu_data()[i]; + Dtype abs_val = fabs(val); + if (abs_val < 1.0 / sigma2_) { + errors_.mutable_cpu_data()[i] = 0.5 * val * val * sigma2_; + } + else { + errors_.mutable_cpu_data()[i] = abs_val - 0.5 / sigma2_; + } + } + if (has_weights_) { + // apply "outside" weights + caffe_mul(count, bottom[3]->cpu_data(), errors_.cpu_data(), errors_.mutable_cpu_data()); // d := w_out * SmoothL1(w_in * (b0 - b1)) + } + + Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data()); + top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num(); } + template -void SmoothL1LossLayer::Backward_cpu(const vector*>& top, - const vector& propagate_down, const vector*>& bottom) { - int count = diff_.count(); - Dtype* diff_data = diff_.mutable_cpu_data(); - for (int i = 0; i < count; ++i) { - Dtype val = diff_data[i]; - // f'(x) = x if |x| < 1 - // = sign(x) otherwise - if (fabs(val) < 1.) { - diff_data[i] = val; - } else { - diff_data[i] = (Dtype(0) < val) - (val < Dtype(0)); +void SmoothL1LossLayer::Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) +{ + // after forwards, diff_ holds w_in * (b0 - b1) + int count = diff_.count(); + + for (int i = 0; i < count; i++) { + // f'(x) = sigma * sigma * x if |x| < 1 / sigma / sigma + // = sign(x) otherwise + Dtype val = diff_.cpu_data()[i]; + Dtype abs_val = fabs(val); + if (abs_val < 1.0 / sigma2_) { + diff_.mutable_cpu_data()[i] = sigma2_ * val; + } + else { + diff_.mutable_cpu_data()[i] = (Dtype(0) < val) - (val < Dtype(0)); + } } - } - for (int i = 0; i < 2; ++i) { - if (propagate_down[i]) { - const Dtype sign = (i == 0) ? 1 : -1; - const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); - caffe_cpu_axpby( - bottom[i]->count(), // count - alpha, // alpha - diff_.cpu_data(), // a - Dtype(0), // beta - bottom[i]->mutable_cpu_diff()); // b + + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 
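A standalone sketch (not part of the patch) of the sigma-parameterized smooth-L1 function and its derivative that the rewritten CPU and GPU paths implement; `sigma2` here stands in for the layer's `sigma2_`, i.e. `smooth_l1_loss_param().sigma()` squared. With sigma = 1 it reduces to the original 0.5*x^2 / |x|-0.5 form, and both branches agree at the threshold |x| = 1/sigma^2.

```cpp
#include <cmath>
#include <cstdio>

// f(x)  = 0.5 * (sigma * x)^2   if |x| < 1 / sigma^2
//       = |x| - 0.5 / sigma^2   otherwise
float smooth_l1(float x, float sigma2) {
  float ax = std::fabs(x);
  return ax < 1.0f / sigma2 ? 0.5f * x * x * sigma2 : ax - 0.5f / sigma2;
}

// f'(x) = sigma^2 * x           if |x| < 1 / sigma^2
//       = sign(x)               otherwise
float smooth_l1_grad(float x, float sigma2) {
  return std::fabs(x) < 1.0f / sigma2 ? sigma2 * x : (x > 0) - (x < 0);
}

int main() {
  const float sigma = 3.0f, sigma2 = sigma * sigma;  // hypothetical sigma
  const float xs[] = {-1.0f, -0.05f, 0.05f, 1.0f};
  for (float x : xs)
    printf("x=% .2f  f=% .4f  f'=% .4f\n",
           x, smooth_l1(x, sigma2), smooth_l1_grad(x, sigma2));
  return 0;
}
```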
1 : -1; + const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); + caffe_cpu_axpby( + count, // count + alpha, // alpha + diff_.cpu_data(), // x + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // y + if (has_weights_) { + // Scale by "inside" weight + caffe_mul( + count, + bottom[2]->cpu_data(), + bottom[i]->cpu_diff(), + bottom[i]->mutable_cpu_diff()); + // Scale by "outside" weight + caffe_mul( + count, + bottom[3]->cpu_data(), + bottom[i]->cpu_diff(), + bottom[i]->mutable_cpu_diff()); + } + } } - } } #ifdef CPU_ONLY diff --git a/src/caffe/layers/smooth_L1_loss_layer.cu b/src/caffe/layers/smooth_L1_loss_layer.cu index 1dbf9f4dc..eca0e2b59 100644 --- a/src/caffe/layers/smooth_L1_loss_layer.cu +++ b/src/caffe/layers/smooth_L1_loss_layer.cu @@ -9,21 +9,21 @@ #include #include "caffe/layers/smooth_L1_loss_layer.hpp" -#include "caffe/util/math_functions.hpp" namespace caffe { template -__global__ void SmoothL1Forward(const int n, const Dtype* in, Dtype* out) { - // f(x) = 0.5 * x^2 if |x| < 1 - // |x| - 0.5 otherwise +__global__ void SmoothL1Forward(const int n, const Dtype* in, Dtype* out, + Dtype sigma2) { + // f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma + // |x| - 0.5 / sigma / sigma otherwise CUDA_KERNEL_LOOP(index, n) { Dtype val = in[index]; Dtype abs_val = abs(val); - if (abs_val < 1) { - out[index] = 0.5 * val * val; + if (abs_val < 1.0 / sigma2) { + out[index] = 0.5 * val * val * sigma2; } else { - out[index] = abs_val - 0.5; + out[index] = abs_val - 0.5 / sigma2; } } } @@ -38,31 +38,43 @@ void SmoothL1LossLayer::Forward_gpu(const vector*>& bottom, bottom[1]->gpu_data(), diff_.mutable_gpu_data()); // d := b0 - b1 if (has_weights_) { + // apply "inside" weights caffe_gpu_mul( count, bottom[2]->gpu_data(), diff_.gpu_data(), - diff_.mutable_gpu_data()); // d := w * (b0 - b1) + diff_.mutable_gpu_data()); // d := w_in * (b0 - b1) } // NOLINT_NEXT_LINE(whitespace/operators) - SmoothL1Forward<<>>( - count, diff_.gpu_data(), errors_.mutable_gpu_data()); + SmoothL1Forward<<>>( + count, diff_.gpu_data(), errors_.mutable_gpu_data(), sigma2_); CUDA_POST_KERNEL_CHECK; + if (has_weights_) { + // apply "outside" weights + caffe_gpu_mul( + count, + bottom[3]->gpu_data(), + errors_.gpu_data(), + errors_.mutable_gpu_data()); // d := w_out * SmoothL1(w_in * (b0 - b1)) + } + Dtype loss; - caffe_gpu_asum(count, errors_.gpu_data(), &loss); + caffe_gpu_dot(count, ones_.gpu_data(), errors_.gpu_data(), &loss); top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num(); } template -__global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out) { - // f'(x) = x if |x| < 1 - // = sign(x) otherwise +__global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out, + Dtype sigma2) { + // f'(x) = sigma * sigma * x if |x| < 1 / sigma / sigma + // = sign(x) otherwise CUDA_KERNEL_LOOP(index, n) { Dtype val = in[index]; Dtype abs_val = abs(val); - if (abs_val < 1) { - out[index] = val; + if (abs_val < 1.0 / sigma2) { + out[index] = sigma2 * val; } else { out[index] = (Dtype(0) < val) - (val < Dtype(0)); } @@ -72,21 +84,37 @@ __global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out) { template void SmoothL1LossLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { + // after forwards, diff_ holds w_in * (b0 - b1) int count = diff_.count(); // NOLINT_NEXT_LINE(whitespace/operators) - SmoothL1Backward<<>>( - count, diff_.gpu_data(), diff_.mutable_gpu_data()); + SmoothL1Backward<<>>( + count, diff_.gpu_data(), 
diff_.mutable_gpu_data(), sigma2_); CUDA_POST_KERNEL_CHECK; for (int i = 0; i < 2; ++i) { if (propagate_down[i]) { const Dtype sign = (i == 0) ? 1 : -1; const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num(); caffe_gpu_axpby( - bottom[i]->count(), // count + count, // count alpha, // alpha diff_.gpu_data(), // x Dtype(0), // beta bottom[i]->mutable_gpu_diff()); // y + if (has_weights_) { + // Scale by "inside" weight + caffe_gpu_mul( + count, + bottom[2]->gpu_data(), + bottom[i]->gpu_diff(), + bottom[i]->mutable_gpu_diff()); + // Scale by "outside" weight + caffe_gpu_mul( + count, + bottom[3]->gpu_data(), + bottom[i]->gpu_diff(), + bottom[i]->mutable_gpu_diff()); + } } } } diff --git a/src/caffe/layers/smooth_L1_loss_ohem_layer.cpp b/src/caffe/layers/smooth_L1_loss_ohem_layer.cpp new file mode 100644 index 000000000..e0ec66f4b --- /dev/null +++ b/src/caffe/layers/smooth_L1_loss_ohem_layer.cpp @@ -0,0 +1,242 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include +#include +#include +#include +#include + + +#include "caffe/layers/smooth_L1_loss_ohem_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SmoothL1LossOHEMLayer::LayerSetUp( + const vector*>& bottom, const vector*>& top) { + has_weights_ = (bottom.size() == 3); + + if (!this->layer_param_.loss_param().has_normalization() && + this->layer_param_.loss_param().has_normalize()) { + normalization_ = this->layer_param_.loss_param().normalize() ? 
+          LossParameter_NormalizationMode_VALID :
+          LossParameter_NormalizationMode_BATCH_SIZE;
+  } else {
+    normalization_ = this->layer_param_.loss_param().normalization();
+  }
+}
+
+template <typename Dtype>
+void SmoothL1LossOHEMLayer<Dtype>::Reshape(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  LossLayer<Dtype>::Reshape(bottom, top);
+  CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
+  CHECK_EQ(bottom[0]->height(), bottom[1]->height());
+  CHECK_EQ(bottom[0]->width(), bottom[1]->width());
+  if (has_weights_) {
+    CHECK_EQ(bottom[0]->channels(), bottom[2]->channels());
+    CHECK_EQ(bottom[0]->height(), bottom[2]->height());
+    CHECK_EQ(bottom[0]->width(), bottom[2]->width());
+  }
+
+  outer_num_ = bottom[0]->num();
+  inner_num_ = bottom[0]->height() * bottom[0]->width();
+
+  diff_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+  errors_.Reshape(bottom[0]->num(), bottom[0]->channels(),
+      bottom[0]->height(), bottom[0]->width());
+
+  // top[1] stores the per-instance loss, which takes the shape N*1*H*W
+  if (top.size() >= 2) {
+    top[1]->Reshape(
+        bottom[0]->num(), 1, bottom[0]->height(), bottom[0]->width());
+  }
+}
+
+template <typename Dtype>
+Dtype SmoothL1LossOHEMLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode,
+    Dtype pre_fixed_normalizer) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_PRE_FIXED:
+      normalizer = pre_fixed_normalizer;
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
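Taken in isolation, the mode-to-divisor mapping above (together with the max guard applied by the return that follows) behaves like this minimal standalone sketch; the enum and function names here are illustrative, not part of the patch:

```
// Minimal sketch of get_normalizer(): maps a normalization mode to a divisor.
// Assumes outer_num = N and inner_num = H*W, as set up in Reshape() above.
#include <algorithm>
#include <cstdio>

enum class Norm { FULL, VALID, BATCH_SIZE, PRE_FIXED, NONE };

double normalizer(Norm mode, int outer_num, int inner_num, double pre_fixed) {
  double n = 1.0;
  switch (mode) {
    case Norm::FULL:       n = double(outer_num) * inner_num; break;
    case Norm::VALID:      n = double(outer_num) * inner_num; break;  // no ignore_label in this layer
    case Norm::BATCH_SIZE: n = outer_num; break;
    case Norm::PRE_FIXED:  n = pre_fixed; break;
    case Norm::NONE:       n = 1.0; break;
  }
  return std::max(1.0, n);  // guard against a zero divisor
}

int main() {
  // With N = 2 and H*W = 9: FULL divides by 18, BATCH_SIZE by 2,
  // PRE_FIXED by whatever the prototxt supplies (here 128).
  std::printf("%.0f %.0f %.0f\n",
              normalizer(Norm::FULL, 2, 9, 128),
              normalizer(Norm::BATCH_SIZE, 2, 9, 128),
              normalizer(Norm::PRE_FIXED, 2, 9, 128));
  return 0;
}
```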
+ return std::max(Dtype(1.0), normalizer); +} + +template +void SmoothL1LossOHEMLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + int count = bottom[0]->count(); + + caffe_sub( + count, + bottom[0]->cpu_data(), + bottom[1]->cpu_data(), + diff_.mutable_cpu_data()); // d := b0 - b1 + if (has_weights_) { + caffe_mul( + count, + bottom[2]->cpu_data(), + diff_.cpu_data(), + diff_.mutable_cpu_data()); // d := w * (b0 - b1) + } + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int index = 0; index < count; index++) { + Dtype val = diff_.cpu_data()[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + errors_.mutable_cpu_data()[index] = 0.5 * val * val; + } else { + errors_.mutable_cpu_data()[index] = abs_val - 0.5; + } + } + + Dtype loss = caffe_cpu_asum(count, errors_.cpu_data()); + + Dtype pre_fixed_normalizer = + this->layer_param_.loss_param().pre_fixed_normalizer(); + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, + pre_fixed_normalizer); + + // Output per-instance loss + if (top.size() >= 2) { +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + Dtype sum = 0; + for (int c = 0; c < bottom[0]->channels(); ++c) { + sum += errors_.cpu_data()[(i * bottom[0]->channels() + c) * inner_num_ + j]; + } + top[1]->mutable_cpu_data()[i * inner_num_ + j] = sum; + } + } + } +} + +#if 0 +template +__global__ void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + CUDA_KERNEL_LOOP(index, num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + Dtype sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } +} +#endif + +template +void SmoothL1LossOHEMLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int count = diff_.count(); +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int index = 0; index < count; index++) { + Dtype val = diff_.cpu_data()[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + diff_.mutable_cpu_data()[index] = val; + } else { + diff_.mutable_cpu_data()[index] = (Dtype(0) < val) - (val < Dtype(0)); + } + } + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 1 : -1; + + Dtype pre_fixed_normalizer = + this->layer_param_.loss_param().pre_fixed_normalizer(); + Dtype normalizer = get_normalizer(normalization_, pre_fixed_normalizer); + Dtype alpha = sign * top[0]->cpu_diff()[0] / normalizer; + + caffe_cpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.cpu_data(), // x + Dtype(0), // beta + bottom[i]->mutable_cpu_diff()); // y + } + } +} + +#ifdef CPU_ONLY +STUB_GPU(SmoothL1LossOHEMLayer); +#endif + +INSTANTIATE_CLASS(SmoothL1LossOHEMLayer); +REGISTER_LAYER_CLASS(SmoothL1LossOHEM); + +} // namespace caffe diff --git a/src/caffe/layers/smooth_L1_loss_ohem_layer.cu b/src/caffe/layers/smooth_L1_loss_ohem_layer.cu new file mode 100644 index 000000000..776106d9e --- /dev/null +++ b/src/caffe/layers/smooth_L1_loss_ohem_layer.cu @@ -0,0 +1,164 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. 
+ +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include +#include +#include + +#include "thrust/device_vector.h" + +#include "caffe/layers/smooth_L1_loss_ohem_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + template + __global__ void SmoothL1ForwardGPU(const int n, const Dtype* in, Dtype* out) { + // f(x) = 0.5 * x^2 if |x| < 1 + // |x| - 0.5 otherwise + CUDA_KERNEL_LOOP(index, n) { + Dtype val = in[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + out[index] = 0.5 * val * val; + } else { + out[index] = abs_val - 0.5; + } + } + } + + template + __global__ void kernel_channel_sum(const int num, const int channels, + const int spatial_dim, const Dtype* data, Dtype* channel_sum) { + CUDA_KERNEL_LOOP(index, num * spatial_dim) { + int n = index / spatial_dim; + int s = index % spatial_dim; + Dtype sum = 0; + for (int c = 0; c < channels; ++c) { + sum += data[(n * channels + c) * spatial_dim + s]; + } + channel_sum[index] = sum; + } + } + + template + void SmoothL1LossOHEMLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + int count = bottom[0]->count(); + caffe_gpu_sub( + count, + bottom[0]->gpu_data(), + bottom[1]->gpu_data(), + diff_.mutable_gpu_data()); // d := b0 - b1 + if (has_weights_) { + caffe_gpu_mul( + count, + bottom[2]->gpu_data(), + diff_.gpu_data(), + diff_.mutable_gpu_data()); // d := w * (b0 - b1) + } + SmoothL1ForwardGPU << > >(count, diff_.gpu_data(), + errors_.mutable_gpu_data()); + CUDA_POST_KERNEL_CHECK; + + + Dtype loss; + caffe_gpu_asum(count, errors_.gpu_data(), &loss); + int spatial_dim = diff_.height() * diff_.width(); + + Dtype pre_fixed_normalizer = + this->layer_param_.loss_param().pre_fixed_normalizer(); + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, + pre_fixed_normalizer); + + // Output per-instance loss + if (top.size() >= 2) { + kernel_channel_sum << count()), + CAFFE_CUDA_NUM_THREADS >> > 
(outer_num_, bottom[0]->channels(), + inner_num_, errors_.gpu_data(), top[1]->mutable_gpu_data()); + } + } + + template + __global__ void SmoothL1BackwardGPU( + const int n, const Dtype* in, Dtype* out) { + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + CUDA_KERNEL_LOOP(index, n) { + Dtype val = in[index]; + Dtype abs_val = abs(val); + if (abs_val < 1) { + out[index] = val; + } else { + out[index] = (Dtype(0) < val) - (val < Dtype(0)); + } + } + } + + template + void SmoothL1LossOHEMLayer::Backward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + int count = diff_.count(); + SmoothL1BackwardGPU << > >(count, diff_.gpu_data(), + diff_.mutable_gpu_data()); + CUDA_POST_KERNEL_CHECK; + for (int i = 0; i < 2; ++i) { + if (propagate_down[i]) { + const Dtype sign = (i == 0) ? 1 : -1; + int spatial_dim = diff_.height() * diff_.width(); + + Dtype pre_fixed_normalizer = + this->layer_param_.loss_param().pre_fixed_normalizer(); + Dtype normalizer = get_normalizer(normalization_, pre_fixed_normalizer); + Dtype alpha = sign * top[0]->cpu_diff()[0] / normalizer; + + caffe_gpu_axpby( + bottom[i]->count(), // count + alpha, // alpha + diff_.gpu_data(), // x + Dtype(0), // beta + bottom[i]->mutable_gpu_diff()); // y + } + } + } + + INSTANTIATE_LAYER_GPU_FUNCS(SmoothL1LossOHEMLayer); + +} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_ohem_layer.cpp b/src/caffe/layers/softmax_loss_ohem_layer.cpp new file mode 100644 index 000000000..a57545274 --- /dev/null +++ b/src/caffe/layers/softmax_loss_ohem_layer.cpp @@ -0,0 +1,220 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + +#include +#include +#include + +#include "caffe/layers/softmax_loss_ohem_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SoftmaxWithLossOHEMLayer::LayerSetUp( + const vector*>& bottom, const vector*>& top) { + LossLayer::LayerSetUp(bottom, top); + LayerParameter softmax_param(this->layer_param_); + // Fix a bug which occurs with more than one output + softmax_param.clear_loss_weight(); + softmax_param.set_type("Softmax"); + softmax_layer_ = LayerRegistry::CreateLayer(softmax_param); + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.clear(); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); + + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + if (!this->layer_param_.loss_param().has_normalization() && + this->layer_param_.loss_param().has_normalize()) { + normalization_ = this->layer_param_.loss_param().normalize() ? + LossParameter_NormalizationMode_VALID : + LossParameter_NormalizationMode_BATCH_SIZE; + } else { + normalization_ = this->layer_param_.loss_param().normalization(); + } +} + +template +void SoftmaxWithLossOHEMLayer::Reshape( + const vector*>& bottom, const vector*>& top) { + LossLayer::Reshape(bottom, top); + softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + softmax_axis_ = + bottom[0]->CanonicalAxisIndex(this->layer_param_.softmax_param().axis()); + outer_num_ = bottom[0]->count(0, softmax_axis_); + inner_num_ = bottom[0]->count(softmax_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + if (top.size() >= 2) { + // softmax output + top[1]->ReshapeLike(*bottom[0]); + } + + // top[2] stores per-instance loss, which takes the shape of N*1*H*W + if (top.size() >= 3) { + top[2]->ReshapeLike(*bottom[1]); + } +} + +template +Dtype SoftmaxWithLossOHEMLayer::get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count) { + Dtype normalizer; + switch (normalization_mode) { + case LossParameter_NormalizationMode_FULL: + normalizer = Dtype(outer_num_ * inner_num_); + break; + case LossParameter_NormalizationMode_VALID: + if (valid_count == -1) { + normalizer = Dtype(outer_num_ * inner_num_); + } else { + normalizer = Dtype(valid_count); + } + break; + case LossParameter_NormalizationMode_BATCH_SIZE: + normalizer = Dtype(outer_num_); + break; + case LossParameter_NormalizationMode_NONE: + normalizer = Dtype(1); + break; + default: + LOG(FATAL) << "Unknown normalization mode: " + << LossParameter_NormalizationMode_Name(normalization_mode); + } + // Some users will have no labels for some examples in order to 'turn off' a + // particular loss in a multi-task setup. The max prevents NaNs in that case. + return std::max(Dtype(1.0), normalizer); +} + +template +void SoftmaxWithLossOHEMLayer::Forward_cpu( + const vector*>& bottom, const vector*>& top) { + + // The forward pass computes the softmax prob values. 
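The loop that follows fills loss_data with one cross-entropy term per spatial position, -log p[label], clamped at FLT_MIN. A minimal CPU sketch of that indexing, leaving out the ignore_label handling for brevity (names are illustrative):

```
// Per-position softmax cross-entropy over a (num, channels, spatial) layout,
// as in the Forward_cpu loop: loss[n, s] = -log(max(prob[n, label, s], FLT_MIN)).
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

void per_position_loss(const std::vector<float>& prob, const std::vector<int>& label,
                       int num, int channels, int spatial, std::vector<float>* loss) {
  loss->assign(num * spatial, 0.0f);
  for (int n = 0; n < num; ++n) {
    for (int s = 0; s < spatial; ++s) {
      const int y = label[n * spatial + s];
      const float p = prob[(n * channels + y) * spatial + s];
      (*loss)[n * spatial + s] = -std::log(std::max(p, FLT_MIN));
    }
  }
}
```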
+ softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + Dtype* loss_data = bottom[0]->mutable_cpu_diff(); + int count = 0; + Dtype loss = 0; + + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, prob_.shape(softmax_axis_)); + + // loss -= log(std::max(prob_data[i * dim + label_value * inner_num_ + j], + // Dtype(FLT_MIN))); + loss_data[i*inner_num_+j] = -log(std::max(prob_data[i * dim + label_value * inner_num_ + j], + Dtype(FLT_MIN))); + ++count; + } + } + loss = caffe_cpu_asum(count, loss_data); + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count); + if (top.size() == 2) { + top[1]->ShareData(prob_); + } + + if (top.size() >= 3) { + // Output per-instance loss + caffe_copy(top[2]->count(), loss_data, + top[2]->mutable_cpu_data()); + } + + // Fix a bug, which happens when propagate_down[0] = false in backward + caffe_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_cpu_diff()); +} + +template +void SoftmaxWithLossOHEMLayer::Backward_cpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const Dtype* prob_data = prob_.cpu_data(); + caffe_copy(prob_.count(), prob_data, bottom_diff); + const Dtype* label = bottom[1]->cpu_data(); + int dim = prob_.count() / outer_num_; + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = static_cast(label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < bottom[0]->shape(softmax_axis_); ++c) { + bottom_diff[i * dim + c * inner_num_ + j] = 0; + } + } else { + bottom_diff[i * dim + label_value * inner_num_ + j] -= 1; + ++count; + } + } + } + // Scale gradient + Dtype loss_weight = top[0]->cpu_diff()[0] / + get_normalizer(normalization_, count); + caffe_scal(prob_.count(), loss_weight, bottom_diff); + } +} + +#ifdef CPU_ONLY +STUB_GPU(SoftmaxWithLossOHEMLayer); +#endif + +INSTANTIATE_CLASS(SoftmaxWithLossOHEMLayer); +REGISTER_LAYER_CLASS(SoftmaxWithLossOHEM); + +} // namespace caffe diff --git a/src/caffe/layers/softmax_loss_ohem_layer.cu b/src/caffe/layers/softmax_loss_ohem_layer.cu new file mode 100644 index 000000000..1acf9bd4f --- /dev/null +++ b/src/caffe/layers/softmax_loss_ohem_layer.cu @@ -0,0 +1,175 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#include +#include +#include + +#include "caffe/layers/softmax_loss_ohem_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +__global__ void SoftmaxLossForwardGPU(const int nthreads, + const Dtype* prob_data, const Dtype* label, Dtype* loss, + const int num, const int dim, const int spatial_dim, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + if (has_ignore_label_ && label_value == ignore_label_) { + loss[index] = 0; + counts[index] = 0; + } else { + loss[index] = -log(max(prob_data[n * dim + label_value * spatial_dim + s], + Dtype(FLT_MIN))); + counts[index] = 1; + } + } +} + +template +void SoftmaxWithLossOHEMLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything until it is overwritten + // on the backward pass, we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* loss_data = bottom[0]->mutable_gpu_diff(); + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossForwardGPU<<>>(nthreads, prob_data, label, loss_data, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + Dtype loss; + caffe_gpu_asum(nthreads, loss_data, &loss); + Dtype valid_count = -1; + // Only launch another CUDA kernel if we actually need the count of valid + // outputs. 
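The valid count computed below is simply the number of positions whose label is not ignore_label: the counts buffer holds a 0/1 indicator per position and the asum reduces it. A scalar equivalent, with illustrative names:

```
// CPU equivalent of the counts/asum pair used for VALID normalization:
// each non-ignored position contributes exactly 1 to the divisor.
#include <vector>

double valid_count(const std::vector<int>& labels, int ignore_label) {
  double count = 0.0;
  for (int v : labels) {
    if (v != ignore_label) count += 1.0;
  }
  return count;
}
```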
+ if (normalization_ == LossParameter_NormalizationMode_VALID && + has_ignore_label_) { + caffe_gpu_asum(nthreads, counts, &valid_count); + } + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, + valid_count); + if (top.size() >= 2) { + top[1]->ShareData(prob_); + } + if (top.size() >= 3) { + // Output per-instance loss + caffe_gpu_memcpy(top[2]->count() * sizeof(Dtype), loss_data, + top[2]->mutable_gpu_data()); + } + + // Fix a bug, which happens when propagate_down[0] = false in backward + caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); +} + +template +__global__ void SoftmaxLossBackwardGPU(const int nthreads, const Dtype* top, + const Dtype* label, Dtype* bottom_diff, const int num, const int dim, + const int spatial_dim, const bool has_ignore_label_, + const int ignore_label_, Dtype* counts) { + const int channels = dim / spatial_dim; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + + if (has_ignore_label_ && label_value == ignore_label_) { + for (int c = 0; c < channels; ++c) { + bottom_diff[n * dim + c * spatial_dim + s] = 0; + } + counts[index] = 0; + } else { + bottom_diff[n * dim + label_value * spatial_dim + s] -= 1; + counts[index] = 1; + } + } +} + +template +void SoftmaxWithLossOHEMLayer::Backward_gpu( + const vector*>& top, const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[1]) { + LOG(FATAL) << this->type() + << " Layer cannot backpropagate to label inputs."; + } + if (propagate_down[0]) { + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const Dtype* prob_data = prob_.gpu_data(); + const Dtype* top_data = top[0]->gpu_data(); + caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff); + const Dtype* label = bottom[1]->gpu_data(); + const int dim = prob_.count() / outer_num_; + const int nthreads = outer_num_ * inner_num_; + // Since this memory is never used for anything else, + // we use to to avoid allocating new GPU memory. + Dtype* counts = prob_.mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + SoftmaxLossBackwardGPU<<>>(nthreads, top_data, label, bottom_diff, + outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts); + + Dtype valid_count = -1; + // Only launch another CUDA kernel if we actually need the count of valid + // outputs. + if (normalization_ == LossParameter_NormalizationMode_VALID && + has_ignore_label_) { + caffe_gpu_asum(nthreads, counts, &valid_count); + } + const Dtype loss_weight = top[0]->cpu_diff()[0] / + get_normalizer(normalization_, valid_count); + caffe_gpu_scal(prob_.count(), loss_weight , bottom_diff); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(SoftmaxWithLossOHEMLayer); + +} // namespace caffe diff --git a/src/caffe/layers/spatial_dropout_layer.cpp b/src/caffe/layers/spatial_dropout_layer.cpp new file mode 100644 index 000000000..ef79760f2 --- /dev/null +++ b/src/caffe/layers/spatial_dropout_layer.cpp @@ -0,0 +1,136 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. 
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include + +#include "caffe/layers/spatial_dropout_layer.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template +void SpatialDropoutLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + NeuronLayer::LayerSetUp(bottom, top); + threshold_ = this->layer_param_.dropout_param().dropout_ratio(); + DCHECK(threshold_ > 0.); + DCHECK(threshold_ < 1.); + scale_ = 1. / (1. - threshold_); + uint_thres_ = static_cast(UINT_MAX * threshold_); +} + +template +void SpatialDropoutLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + NeuronLayer::Reshape(bottom, top); + // Set up the cache for random number generation + // ReshapeLike does not work because rand_vec_ is of Dtype uint + + rand_vec_.Reshape(bottom[0]->num(), bottom[0]->channels(), 1, 1); +} + +template +void SpatialDropoutLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + unsigned int* mask = rand_vec_.mutable_cpu_data(); + const int num = bottom[0]->num(); + const int channel = bottom[0]->channels(); + const int img_size = bottom[0]->height() * bottom[0]->width(); + if (this->phase_ == TRAIN) { + // Create random numbers + caffe_rng_bernoulli(num * channel, 1. 
- threshold_, mask); +#ifdef _OPENMP + #pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < num; ++i) { + for (int j = 0; j < channel; j++) { + int offset = (i * channel + j) * img_size; + if (mask[i * channel + j] == 1) { + caffe_cpu_axpby(img_size, scale_, bottom_data + offset, (Dtype)0, top_data + offset); + } else { + caffe_set(img_size, (Dtype)0, top_data + offset); + } + } + } + } else { + caffe_copy(bottom[0]->count(), bottom_data, top_data); + } +} + +template +void SpatialDropoutLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int num = bottom[0]->num(); + const int channel = bottom[0]->channels(); + const int img_size = bottom[0]->height() * bottom[0]->width(); + if (this->phase_ == TRAIN) { + const unsigned int* mask = rand_vec_.cpu_data(); +#ifdef _OPENMP + #pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < num; ++i) { + for (int j = 0; j < channel; j++) { + int offset = (i * channel + j) * img_size; + if (mask[i * channel + j] == 1) { + caffe_cpu_axpby(img_size, scale_, top_diff + offset, (Dtype)0, bottom_diff + offset); + } else { + caffe_set(img_size, (Dtype)0, bottom_diff + offset); + } + } + } + } else { + caffe_copy(bottom[0]->count(), top_diff, bottom_diff); + } + } +} + + +#ifdef CPU_ONLY +STUB_GPU(SpatialDropoutLayer); +#endif + +INSTANTIATE_CLASS(SpatialDropoutLayer); +REGISTER_LAYER_CLASS(SpatialDropout); + +} // namespace caffe diff --git a/src/caffe/mkldnn_memory.cpp b/src/caffe/mkldnn_memory.cpp index c53cff7ff..4bc2aa13b 100644 --- a/src/caffe/mkldnn_memory.cpp +++ b/src/caffe/mkldnn_memory.cpp @@ -478,6 +478,15 @@ shared_ptr MKLDNNMemoryDescriptor::get_memory_desc return desc; } +template +size_t MKLDNNMemoryDescriptor::get_memory_count() { + if (this->conversion_needed()) { + return this->prv_count(); + } else { + return this->_blob->count(); + } +} + template shared_ptr > get_mkldnn_prv_descriptor(Blob* blob) { diff --git a/src/caffe/multinode/apply_mn_param.cpp b/src/caffe/multinode/apply_mn_param.cpp index 29e32079b..1051c0913 100644 --- a/src/caffe/multinode/apply_mn_param.cpp +++ b/src/caffe/multinode/apply_mn_param.cpp @@ -152,6 +152,10 @@ void ApplyMultinodeParams(const NetParameter& param, // insert activation layers if needed param_with_mn->CopyFrom(param); param_with_mn->clear_layer(); + if (mn::is_param_server()) { + // do not insert activation layers when loaded on param servers + blob_param_map.clear(); + } for (int i = 0; i < param.layer_size(); i++) { const LayerParameter& orig_layer_param = param.layer(i); map updated_blob_idx_to_name; @@ -186,7 +190,7 @@ void ApplyMultinodeParams(const NetParameter& param, int num_nodes = mn_layer_param.num_nodes(); int model_parts = mn_layer_param.model_parts(); mn::GetCanonicalMnParam(num_nodes, model_parts); - if (model_parts > 1) { + if (model_parts > 1 && !mn::is_param_server()) { // TODO: support transpose // TODO: support undividible num_output if (layer_param->type() == "Convolution") { diff --git a/src/caffe/multinode/async_param_server.cpp b/src/caffe/multinode/async_param_server.cpp new file mode 100644 index 000000000..a403bafb5 --- /dev/null +++ b/src/caffe/multinode/async_param_server.cpp @@ -0,0 +1,277 @@ +/* +All modification made by Intel Corporation: © 2017 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the 
University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +// Initial implementation from Jian Zhang and Ioannis Mitliagkas, Stanford, on Oct 2 2016 +// Refer to the paper: https://arxiv.org/pdf/1708.05256.pdf + +#ifdef USE_MLSL + +#include +#include +#include +#include + +#include "caffe/caffe.hpp" +#include "caffe/multinode/async_param_server.hpp" + +namespace caffe { + namespace mn { + + using std::make_pair; + + template + AsyncParamServer::AsyncParamServer(boost::shared_ptr > solver) : + recv_tasks_iter_(0), + solver_(solver), + send_cnt_(0), update_cnt_(0) { + + // setup the mpi buffers and recv task vector + int mpi_rank = get_node_rank(); + shared_ptr> net = solver_->net(); + const vector *> &net_params = net->learnable_params(); + + for (int i = 0; i < get_num_groups(); i++) { + int root_rank = get_group_root_rank(i); + //iterate over layers and skip the ones without params + for (int j = 0; j < net->layers().size(); j++) { + shared_ptr> layer = net->layers()[j]; + //skip layers w/o parameters + if ((layer->layerOp == nullptr) || !(layer->layerOp->HasParameterSets())) { + continue; + } + const MultinodeLayerParameter & mn_layer_param = layer->layer_param().multinode(); + int model_parts = mn_layer_param.model_parts(); + int mn_num_nodes = mn_layer_param.num_nodes(); + GetCanonicalMnParam(mn_num_nodes, model_parts); + vector layer_param_ids = net->get_layer_learnable_param_ids(j); + for (int k = 0; k < layer_param_ids.size(); k++) { + int param_id = layer_param_ids[k]; + if (!layer->ParamNeedReduce(k)) continue; + if (param_to_server_rank(j, param_id) != mpi_rank) continue; + Blob *blob = net_params[param_id]; + // Setup buf for recv + Dtype* buf = (Dtype*)std::malloc(sizeof(Dtype) * blob->count()); + recv_buf_[make_pair(root_rank, param_id)] = make_pair(buf, blob->count()); + for (int part_id = 0; part_id < model_parts; part_id++) { + int part_root_rank = 
get_group_root_rank(i, part_id, model_parts); + int64_t part_offset = part_id * blob->count() / model_parts; + TaskRequest recv_task(part_root_rank, j, param_id, part_id, model_parts); + recv_tasks_.push_back(recv_task); + rank_layer_blob_to_vec_pos[make_pair(part_root_rank, param_id)] = + recv_tasks_.size() - 1; + MPI_Irecv(buf + part_offset, blob->count() / model_parts, + DtypeToMPIDtype(), part_root_rank, + recv_task.GetTag(), MPI_COMM_WORLD, + &(recv_tasks_[recv_tasks_.size() - 1].mpi_request_)); + async_iter_[make_pair(param_id, part_id)] = solver_->iter(); + } + // Setup buf for send + buf = (Dtype*)std::malloc(sizeof(Dtype) * blob->count()); + send_buf_[make_pair(root_rank, param_id)] = make_pair(buf, blob->count()); + } + } + } + total_update_ = total_send_ = recv_tasks_.size() * (solver_->param().max_iter() - 1); + } + + template + AsyncParamServer::~AsyncParamServer() { + // clean mpi buffers + shared_ptr> net = solver_->net(); + for (int i = 0; i < get_num_groups(); i++) { + int root_rank = get_group_root_rank(i); + for (int j = 0; j < net->layers().size(); j++) { + vector layer_param_ids = net->get_layer_learnable_param_ids(j); + for (int k = 0; k < layer_param_ids.size(); k++) { + pair key = make_pair(root_rank, layer_param_ids[k]); + if (send_buf_.find(key) != send_buf_.end()) { + std::free(send_buf_[key].first); + } + if (recv_buf_.find(key) != recv_buf_.end()) { + std::free(recv_buf_[key].first); + } + } + } + } + } + + // TODO Jian how to get the correct iter number potentially get the version and set iter before update + template + void AsyncParamServer::ProcessUpdateTask() { + const vector *> &net_params = solver_->net()->learnable_params(); + std::deque to_update; + update_queue_mutex_.lock(); + to_update.swap(update_tasks_); + update_queue_mutex_.unlock(); + while (!to_update.empty() ) { + TaskRequest task = to_update.front(); + to_update.pop_front(); + + // copy to diff in solver + int root_rank = world_rank_to_root_rank(task.part_root_rank_); + Blob* blob = net_params[task.param_id_]; + Dtype* solver_diff = blob->mutable_cpu_diff(); + Dtype* mpi_buf = + recv_buf_[make_pair(root_rank, task.param_id_)].first; + int64_t count = + recv_buf_[make_pair(root_rank, task.param_id_)].second; + CHECK(count == blob->count() ); + //copy MPI buffer to solver_diff + int64_t part_offset = task.part_id_ * count / task.num_parts_; + caffe_copy(count / task.num_parts_, + mpi_buf + part_offset, solver_diff + part_offset); + // apply update + int blob_wise_iter = async_iter_[make_pair(task.param_id_, task.part_id_) ]; + solver_->set_iter(blob_wise_iter); + // TODO: supports partial param update per model parts + solver_->ApplyUpdate(task.param_id_); + + //clean up + solver_->net()->ClearParamDiffs(task.param_id_); + async_iter_[ make_pair(task.param_id_, task.part_id_) ] += 1; + update_cnt_ += 1; + + // copy model(data) in solver to mpi buffer + mpi_buf = send_buf_[make_pair(root_rank, task.param_id_)].first; + caffe_copy(count / task.num_parts_, + blob->cpu_data() + part_offset, mpi_buf + part_offset); + + //ship off + send_queue_mutex_.lock(); + send_tasks_.push_back(task); + send_queue_mutex_.unlock(); + } + } + + + template + void AsyncParamServer::ProcessSendTask() { + std::deque to_send; + send_queue_mutex_.lock(); + to_send.swap(send_tasks_); + send_queue_mutex_.unlock(); + std::vector send_request; + while (!to_send.empty() ) { + TaskRequest task = to_send.front(); + to_send.pop_front(); + + int root_rank = world_rank_to_root_rank(task.part_root_rank_); + int param_id = 
task.param_id_;
+        int part_id = task.part_id_;
+        int tag = task.GetTag();
+
+        // Start a new non-blocking receive to wait for the next message from the roots
+        Dtype* recv_ptr = recv_buf_[make_pair(root_rank, param_id)].first;
+        int count = recv_buf_[make_pair(root_rank, param_id)].second;
+        int64_t part_offset = part_id * count / task.num_parts_;
+        int vec_pos = rank_layer_blob_to_vec_pos[make_pair(task.part_root_rank_, param_id)];
+        MPI_Irecv(recv_ptr + part_offset, count / task.num_parts_, DtypeToMPIDtype<Dtype>(),
+          task.part_root_rank_, tag, MPI_COMM_WORLD, &(recv_tasks_[vec_pos].mpi_request_));
+
+#ifdef DEBUG_ORDER_BCAST
+        DEBUG_INFO("PS send message for layer ") << layer_id << " to rank " << root_rank << " for tag " << tag << std::endl;
+#endif
+
+        // Prepare the matching send now
+        std::pair<Dtype*, int64_t> buf = send_buf_[make_pair(root_rank, param_id)];
+        Dtype* send_ptr = buf.first;
+        // We do not need to track this request: if the blocking recv has not
+        // finished on the root, the root will not start a new send task.
+        // MPI_Request send_request;
+        send_request.push_back(MPI_Request());
+        MPI_Isend(send_ptr + part_offset, count / task.num_parts_, DtypeToMPIDtype<Dtype>(),
+          task.part_root_rank_, tag, MPI_COMM_WORLD, &(send_request.back()));
+
+        // Increment the send counter
+        send_cnt_ += 1;
+      }
+      if (send_request.size() != 0) {
+        MPI_Waitall(send_request.size(), &send_request[0], MPI_STATUSES_IGNORE);
+      }
+    }
+
+
+    template <typename Dtype>
+    void AsyncParamServer<Dtype>::ProcessRecvTask() {
+      int flag = 0;
+      for (int i = 0; i < recv_tasks_.size(); i++) {
+        if (recv_tasks_[recv_tasks_iter_].mpi_request_ != MPI_REQUEST_NULL) {
+          MPI_Test(&(recv_tasks_[recv_tasks_iter_].mpi_request_), &flag, MPI_STATUS_IGNORE);
+          if (flag) {
+            // Currently there is no need to lock the solver buffer, as the
+            // compute thread takes care of both copy operations.
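The hand-off between the communication thread and the compute thread relies on the same pattern throughout this file: push under a lock, then swap the whole deque out under the lock and process the private copy lock-free. A minimal sketch with illustrative names:

```
// Lock/swap queue hand-off as used by ProcessUpdateTask/ProcessSendTask:
// the critical section stays O(1) because std::deque::swap only exchanges
// internal bookkeeping; all real work happens on the local copy.
#include <deque>
#include <mutex>

struct Task { int param_id; };

std::mutex queue_mutex;
std::deque<Task> pending;

void produce(const Task& t) {
  std::lock_guard<std::mutex> guard(queue_mutex);
  pending.push_back(t);
}

void consume() {
  std::deque<Task> local;
  {
    std::lock_guard<std::mutex> guard(queue_mutex);
    local.swap(pending);
  }
  for (const Task& t : local) {
    (void)t.param_id;  // process outside the lock
  }
}
```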
+ update_queue_mutex_.lock(); + update_tasks_.push_back(recv_tasks_[recv_tasks_iter_] ); + update_queue_mutex_.unlock(); + } + } + recv_tasks_iter_ = (recv_tasks_iter_ + 1) % recv_tasks_.size(); + if (flag) return; + } + } + + + template + void AsyncParamServer::ComputeLoop() { + do { + ProcessUpdateTask(); + } while(update_cnt_ < total_update_); + } + + + template + void AsyncParamServer::CommLoop() { + do { + ProcessSendTask(); + ProcessRecvTask(); + } while(send_cnt_ < total_send_); + } + + template + void AsyncParamServer::Run() { + // spawn compute thread + std::thread compute_thread(&AsyncParamServer::ComputeLoop, this); + // spawn communication thread + CommLoop(); + compute_thread.join(); + } + + INSTANTIATE_CLASS(AsyncParamServer); + } // end of namespace mn + +} // end of namespace caffe +#endif diff --git a/src/caffe/multinode/mlsl.cpp b/src/caffe/multinode/mlsl.cpp index 8a0f772af..eb47c7a3e 100644 --- a/src/caffe/multinode/mlsl.cpp +++ b/src/caffe/multinode/mlsl.cpp @@ -43,6 +43,8 @@ namespace caffe { namespace mn { + int nGroup = 1; + int nServer = 0; boost::mutex distrib_lock; std::map, boost::shared_ptr> *distrib_map; @@ -53,6 +55,18 @@ namespace caffe { MLSL::Environment::GetEnv().Init(argc, argv); distrib_map = new std::map, boost::shared_ptr>(); + if (use_param_server()) { + if (is_param_server()) { + // this is for paramter servers + MLSL::Environment::GetEnv().Configure("color=0"); + } + else { + // this is for workers + int group_id = get_group_id(); + std::string config_str = "color=" + std::to_string(group_id + 1); + MLSL::Environment::GetEnv().Configure(config_str.c_str()); + } + } } ~initialize() { delete distrib_map; @@ -61,30 +75,49 @@ namespace caffe { } __init{ argc, argv }; } - shared_ptr create_distrib( + template<> + MPI_Datatype DtypeToMPIDtype() { return MPI_FLOAT; } + + template<> + MPI_Datatype DtypeToMPIDtype() { return MPI_DOUBLE; } + + template<> + MLSL::DataType DtypeToMLSLDtype() { return MLSL::DT_FLOAT; } + + template<> + MLSL::DataType DtypeToMLSLDtype() { return MLSL::DT_DOUBLE; } + + boost::shared_ptr create_distrib( int dataParts, int modelParts, int dataColor, int modelColor, int dataColorMax, int modelColorMax) { - return shared_ptr( + return boost::shared_ptr( new Distribution(dataParts, modelParts, dataColor, modelColor, - dataColorMax, modelColorMax)); + dataColorMax, modelColorMax)); + } + + boost::shared_ptr create_distrib(int dataParts, int modelParts) { + int node_id = get_node_id(); + int num_nodes = get_group_size(); + int modelColor = node_id / modelParts; + int dataColor = node_id % (num_nodes / dataParts); + return create_distrib(dataParts, modelParts, dataColor, modelColor); + } + + boost::shared_ptr create_distrib() { + return create_distrib(get_group_size(), 1); } Distribution * get_distrib(int dataParts, int modelParts) { boost::mutex::scoped_lock l(distrib_lock); std::pair key = std::make_pair(dataParts, modelParts); if (distrib_map->find(key) == distrib_map->end()) { - int node_id = get_node_id(); - int num_nodes = get_nodes_count(); - int modelColor = node_id / modelParts; - int dataColor = node_id % (num_nodes / dataParts); - (*distrib_map)[key] = boost::shared_ptr( - new Distribution(dataParts, modelParts, dataColor, modelColor)); + (*distrib_map)[key] = create_distrib(dataParts, modelParts); } return (*distrib_map)[key].get(); } Distribution * get_distrib() { - return get_distrib(get_nodes_count(), 1); + return get_distrib(get_group_size(), 1); } } } diff --git a/src/caffe/multinode/mn_activation_layer.cpp 
b/src/caffe/multinode/mn_activation_layer.cpp index 7b4c7b8ca..45ed667a6 100644 --- a/src/caffe/multinode/mn_activation_layer.cpp +++ b/src/caffe/multinode/mn_activation_layer.cpp @@ -239,7 +239,7 @@ void MnActivationLayer::Pack(const Dtype *src, Dtype *dst, int N, int C, template bool MnActivationLayer::Bypass(const vector*>& bottom, const vector*>& top) { - return distrib_in_->get_group_id() > 0 && distrib_out_->get_group_id() > 0; + return distrib_in_->get_global_part_id() > 0 && distrib_out_->get_global_part_id() > 0; } #ifdef CPU_ONLY diff --git a/src/caffe/multinode/multi_solver.cpp b/src/caffe/multinode/multi_solver.cpp index 59eec8c7c..a4ab3e734 100644 --- a/src/caffe/multinode/multi_solver.cpp +++ b/src/caffe/multinode/multi_solver.cpp @@ -151,7 +151,7 @@ Dtype MultiSolver::ForwardBackwardImpl(bool first, bool last) { && layers[i]->layerOp->HasParameterSets()) { LAYER_TIMING_START(); for (int j = 0; j < callbacks_.size(); ++j) { - callbacks_[j]->on_iter_finished(i); + callbacks_[j]->on_backward_finished(i); } LAYER_TIMING_STOP(startcomm, i); } diff --git a/src/caffe/multinode/multi_sync.cpp b/src/caffe/multinode/multi_sync.cpp index 448172c7b..1d2b6b1a0 100644 --- a/src/caffe/multinode/multi_sync.cpp +++ b/src/caffe/multinode/multi_sync.cpp @@ -46,7 +46,13 @@ MultiSync::MultiSync(shared_ptr > root_solver) : solver(boost::make_shared >(root_solver)), layers(root_solver->net()->layers()), net(root_solver->net()), - net_params(root_solver->net()->learnable_params()) { + net_params(root_solver->net()->learnable_params()), + reduce_req_vec(net_params.size(), NULL), + irecv_req_vec(net_params.size(), MPI_REQUEST_NULL), + broadcast_req_vec(net_params.size(), NULL), + irecv_done(net_params.size(), true), + broadcast_launched(net_params.size(), true), + distrib_bcast(NULL) { root_solver->param().set_disabled_update(true); if (root_solver->iter() == 0) @@ -57,6 +63,10 @@ MultiSync::MultiSync(shared_ptr > root_solver) param_ids_finished_flags.resize(layers.size()); #endif + if (mn::use_param_server() && !mn::is_param_server()) { + distrib_bcast = mn::create_distrib(); + } + for (int layer_id = 0; layer_id < layers.size(); layer_id++) { shared_ptr > layer = layers[layer_id]; diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 9fda127c6..d97fb9bfc 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -284,16 +284,17 @@ void Net::Init(const NetParameter& in_param) { batch_size = layer_param.memory_data_param().batch_size(); else if (!layer_param.type().compare("WindowData")) batch_size = layer_param.window_data_param().batch_size(); - else if (!layer_param.type().compare("Input")) + else if (!layer_param.type().compare("Input") + && layer_param.input_param().shape(0).dim().size()) batch_size = layer_param.input_param().shape(0).dim(0); if (caffe::TRAIN == param.state().phase()) { LOG(WARNING) << "SetMinibatchSize " << batch_size; if (global_batch_size < 0) { - global_batch_size = batch_size * mn::get_nodes_count(); + global_batch_size = batch_size * mn::get_group_size(); mn::train::set_global_minibatch_size(global_batch_size); } else { - CHECK_EQ(global_batch_size, batch_size * mn::get_nodes_count()); + CHECK_EQ(global_batch_size, batch_size * mn::get_group_size()); } } } @@ -553,18 +554,18 @@ void Net::CompilationRuleOne(const NetParameter& param, // If current layer is BatchNorm of MKL2017 engine.. 
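The condition that follows is dense; stripped of the protobuf plumbing it implements a three-way engine-resolution rule. A simplified sketch (the real check works on the BatchNormParameter engine enum plus two free-form engine strings; the names here are illustrative):

```
// A BatchNorm layer resolves to the MKL2017 path if (a) its batch_norm_param
// engine asks for MKL2017 explicitly, (b) it leaves the enum at DEFAULT, sets
// no layer-level engine string, and the net-wide default is MKL2017, or
// (c) the net-wide default is empty and the layer-level string says MKL2017.
#include <string>

enum class BnEngine { DEFAULT, MKL2017, MKLDNN };

bool resolves_to_mkl2017(BnEngine bn_engine,                // batch_norm_param().engine()
                         bool layer_has_engine_str,         // layer_param.has_engine()
                         const std::string& layer_engine,   // layer_param.engine()
                         const std::string& net_engine) {   // param.engine()
  if (bn_engine == BnEngine::MKL2017) return true;
  if (bn_engine == BnEngine::DEFAULT && !layer_has_engine_str &&
      net_engine == "MKL2017") return true;
  if (net_engine.empty() && layer_engine == "MKL2017") return true;
  return false;
}
```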
if (((layer_param->type().compare("BatchNorm") == 0) && - ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKL2017) - || ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_DEFAULT) && - param.engine().compare("MKL2017") == 0))) || + ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_MKL2017) || + ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && + (layer_param->has_engine() == false) && + (param.engine().compare("MKL2017") == 0)) || + (param.engine() == "" && layer_param->engine().compare("MKL2017") == 0))) || // If current layer is BatchNorm of MKLDNN engine.. ((layer_param->type().compare("BatchNorm") == 0) && - ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_MKLDNN) - || (((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && - (param.engine().compare(0, 6, "MKLDNN") == 0)) || - (param.engine() == "" && - layer_param->engine().compare(0, 6, "MKLDNN") == 0))))) { + ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_MKLDNN) || + ((layer_param->batch_norm_param().engine() == BatchNormParameter_Engine_DEFAULT) && + (layer_param->has_engine() == false) && + (param.engine().compare("MKLDNN") == 0)) || + (param.engine() == "" && layer_param->engine().compare("MKLDNN") == 0)))) { std::vector consumer_layer_params; GetBlobConsumers(consumer_layer_params, layer_param->top(0), @@ -721,10 +722,10 @@ void Net::CompilationRuleTwo(const NetParameter& param, template void Net::CompilationRuleThree(const NetParameter& param, - NetParameter* param_compiled) { + NetParameter* param_compiled) { for (int i = 0; i < param.layer_size(); ++i) { LayerParameter* layer_param = - (const_cast(param)).mutable_layer(i); + (const_cast(param)).mutable_layer(i); // Optimization rule 3: // - If we are having engine MKL2017 and Batch Normalization @@ -734,25 +735,22 @@ void Net::CompilationRuleThree(const NetParameter& param, // If current layer is BatchNorm of MKL2017 engine.. if (((layer_param->type().compare("BatchNorm") == 0) && - ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKL2017 || layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_MKLDNN) - || ((layer_param->batch_norm_param().engine() == - BatchNormParameter_Engine_DEFAULT) && - (param.engine().compare("MKL2017") == 0 || param.engine().compare("MKLDNN") == 0)))) && + (layer_param->batch_norm_param().engine() == + BatchNormParameter_Engine_MKL2017 || + ((layer_param->batch_norm_param().engine() == + BatchNormParameter_Engine_DEFAULT) && + param.engine().compare("MKL2017") == 0))) && (layer_param->top(0) == layer_param->bottom(0))) { std::string& batch_norm_top = const_cast(layer_param->top(0)); std::vector consumer_layer_params; - GetBlobConsumers(consumer_layer_params, - batch_norm_top, - param, - i+1 < param.layer_size() ? i+1 : i); + GetBlobConsumers(consumer_layer_params, batch_norm_top, param, + i + 1 < param.layer_size() ? 
i + 1 : i); for (std::vector<const LayerParameter*>::iterator it = - consumer_layer_params.begin(); - it != consumer_layer_params.end(); ++it) { + consumer_layer_params.begin(); + it != consumer_layer_params.end(); ++it) { // If consumer is computing inplace then modify top as well - if (((*it)->top_size() > 0 ) && + if (((*it)->top_size() > 0) && ((*it)->bottom(0).compare((*it)->top(0)) == 0)) { // Modify consumer top const_cast<std::string&>((*it)->top(0)).append("_x"); @@ -770,8 +768,62 @@ void Net<Dtype>::CompilationRuleThree(const NetParameter& param, // Modify top so it is different from bottom batch_norm_top.append("_x"); } + param_compiled->add_layer()->CopyFrom(*layer_param); } + + if (param.state().phase() == TEST) return; + + // Keep the mapping from an in-place blob's name to the layer's index. + // E.g. if the xth layer has an in-place blob, we keep the blob's name as + // the key and the layer's index as the value. + std::map<string, int> inplace_blob_name_to_index; + // Keep the mapping from an input blob's name to the layer's index, + // e.g. save the Eltwise's bottom blob's name as the key and the Eltwise's + // layer index as the value. + std::map<string, int> specified_layer_blob_name_to_index; + // Keep paired bottom-top layers whose blob names need a modified postfix, + // e.g. the BN is the bottom layer while the Eltwise is the top layer. + vector<vector<LayerParameter*> > layer_pairs; + // Keep the input blob names of the layers that force non-in-place + // computation, e.g. Eltwise. + vector<vector<string> > specified_layer_input_blob_names; + + vector<string> raise_non_inplace_layer_type_list; + + // We may add other layers later; Eltwise calls shareDiff(), which raises + // the in-place issue, so we add it to the list. + raise_non_inplace_layer_type_list.push_back("Eltwise"); + + + for (auto layer_type : raise_non_inplace_layer_type_list) { + specified_layer_input_blob_names.clear(); + inplace_blob_name_to_index.clear(); + layer_pairs.clear(); + + ParseNetInplaceStatus( + inplace_blob_name_to_index, specified_layer_blob_name_to_index, + specified_layer_input_blob_names, param_compiled, layer_type); + + for (auto each_blob_list : specified_layer_input_blob_names) { + GetNeedToCancelInplaceLayers( + layer_pairs, specified_layer_blob_name_to_index, + inplace_blob_name_to_index, each_blob_list, *param_compiled); + + for (auto each_layer_pair : layer_pairs) { + std::string& layer_top = + const_cast<std::string&>((each_layer_pair[0])->top(0)); + + for (unsigned int i = 0; i < each_layer_pair[1]->bottom_size(); ++i) { + if (each_layer_pair[1]->bottom(i).compare(layer_top) == 0) { + const_cast<std::string&>(each_layer_pair[1]->bottom(i)).append("_x"); + } + } + + const_cast<std::string&>((each_layer_pair[0])->top(0)).append("_x"); + } + } + } + return; } @@ -797,6 +849,67 @@ void Net<Dtype>::GetBlobConsumers( } } +template <typename Dtype> +void Net<Dtype>::ParseNetInplaceStatus( + std::map<string, int>& inplace_blob_name_to_index, + std::map<string, int>& specified_layer_blob_name_to_index, + vector<vector<string> >& specified_layer_input_blob_names, + NetParameter* param, const string& specified_layer_type) { + for (int layer_index = 0; layer_index < param->layer_size(); ++layer_index) { + LayerParameter* layer_param = + (const_cast<NetParameter&>(*param)).mutable_layer(layer_index); + + if (!specified_layer_type.empty() && + layer_param->type().compare(specified_layer_type) != 0 && + layer_param->bottom_size() == 1 && layer_param->top_size() == 1 && + layer_param->bottom(0) == layer_param->top(0)) { + inplace_blob_name_to_index[layer_param->bottom(0)] = layer_index; + } + + if (!specified_layer_type.empty() && + layer_param->type().compare(specified_layer_type) == 0) { + vector<string> blob_names; + for (unsigned int blob_index = 0; blob_index < 
layer_param->bottom_size(); + blob_index++) { + specified_layer_blob_name_to_index[layer_param->bottom(blob_index)] = + layer_index; + blob_names.push_back(layer_param->bottom(blob_index)); + } + specified_layer_input_blob_names.push_back(blob_names); + } + } +} + +template <typename Dtype> +void Net<Dtype>::GetNeedToCancelInplaceLayers( + vector<vector<LayerParameter*> >& layer_pairs, + std::map<string, int>& specified_layer_blob_name_to_index, + std::map<string, int>& inplace_blob_name_to_index, + vector<string>& each_blob_list, const NetParameter& param) { + if (param.engine().compare("MKLDNN") != 0 || each_blob_list.size() == 1) + return; + + layer_pairs.clear(); + + vector<LayerParameter*> each_layer_pair; + + each_blob_list.erase(each_blob_list.begin()); + + for (auto blob_name : each_blob_list) { + each_layer_pair.clear(); + LayerParameter* bottom_layer = + (const_cast<NetParameter&>(param)) + .mutable_layer(inplace_blob_name_to_index[blob_name]); + LayerParameter* top_layer = + (const_cast<NetParameter&>(param)) + .mutable_layer(specified_layer_blob_name_to_index[blob_name]); + each_layer_pair.push_back(bottom_layer); + each_layer_pair.push_back(top_layer); + + layer_pairs.push_back(each_layer_pair); + } +} + template <typename Dtype> bool Net<Dtype>::StateMeetsRule(const NetState& state, const NetStateRule& rule, const string& layer_name) { diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto old mode 100644 new mode 100755 index cd6cb761f..1d4608dec --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -484,7 +484,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 152 (last added: mn_activation_param) +// LayerParameter next available layer-specific ID: 156 (last added: spatial_dropout_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -540,6 +540,7 @@ message LayerParameter { optional AnnotatedDataParameter annotated_data_param = 200; optional ArgMaxParameter argmax_param = 103; optional BatchNormParameter batch_norm_param = 139; + optional BoxAnnotatorOHEMParameter box_annotator_ohem_param = 152; optional BiasParameter bias_param = 141; optional ConcatParameter concat_param = 104; optional ContrastiveLossParameter contrastive_loss_param = 105; @@ -573,14 +574,17 @@ message LayerParameter { optional PoolingParameter pooling_param = 121; optional PowerParameter power_param = 122; optional PReLUParameter prelu_param = 131; + optional PSROIPoolingParameter psroi_pooling_param = 153; optional PriorBoxParameter prior_box_param = 203; optional PythonParameter python_param = 130; optional RecurrentParameter recurrent_param = 146; optional ReductionParameter reduction_param = 136; optional ReLUParameter relu_param = 123; optional ReshapeParameter reshape_param = 133; + optional ROIPoolingParameter roi_pooling_param = 154; optional ScaleParameter scale_param = 142; optional SigmoidParameter sigmoid_param = 124; + optional SmoothL1LossParameter smooth_l1_loss_param = 148; optional SoftmaxParameter softmax_param = 125; optional SPPParameter spp_param = 132; optional SplitParameter split_param = 147; @@ -590,7 +594,7 @@ message LayerParameter { optional TileParameter tile_param = 138; optional VideoDataParameter video_data_param = 207; optional WindowDataParameter window_data_param = 129; - + optional SpatialDropoutParameter spatial_dropout_param = 155; optional string engine = 149 [default = ""]; optional MultinodeLayerParameter multinode = 150; @@ -810,8 +814,10 @@ message LossParameter { VALID = 1; // Divide by the batch 
size. BATCH_SIZE = 2; + // Divide by pre-fixed normalizer + PRE_FIXED = 3; // Do not normalize the loss. - NONE = 3; + NONE = 4; } // For historical reasons, the default normalization for // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID. @@ -820,6 +826,8 @@ message LossParameter { // is not specified, then setting this to false will be equivalent to // normalization = BATCH_SIZE to be consistent with previous behavior. optional bool normalize = 2; + // pre-fixed normalizer + optional float pre_fixed_normalizer = 4 [default = 1]; } // Messages that store parameters used by individual layer types follow, in @@ -917,6 +925,11 @@ message SplitParameter { optional Engine engine = 1 [default = DEFAULT]; } +message BoxAnnotatorOHEMParameter { + required uint32 roi_per_img = 1; // number of rois for training + optional int32 ignore_label = 2 [default = -1]; // ignore_label in scoring +} + message BiasParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end @@ -1167,6 +1180,10 @@ message DropoutParameter { optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio } +message SpatialDropoutParameter { + optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio +} + // DummyDataLayer fills any number of arbitrarily shaped blobs with random // (or constant) data generated by "Fillers" (see "message FillerParameter"). message DummyDataParameter { @@ -1499,16 +1516,19 @@ message PoolingParameter { optional bool avg_include_pad = 13 [default = true]; optional PoolMethod pool = 1 [default = MAX]; // The pooling method // Pad, kernel size, and stride are all given as a single value for equal - // dimensions in height and width or as Y, X pairs. - optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X) - optional uint32 pad_h = 9 [default = 0]; // The padding height - optional uint32 pad_w = 10 [default = 0]; // The padding width - optional uint32 kernel_size = 2; // The kernel size (square) - optional uint32 kernel_h = 5; // The kernel height - optional uint32 kernel_w = 6; // The kernel width - optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X) - optional uint32 stride_h = 7; // The stride height - optional uint32 stride_w = 8; // The stride width + // dimensions in all spatial dimensions, or once per spatial dimension. + repeated uint32 pad = 2; // The padding size defaults to 0 + repeated uint32 kernel_size = 3; // The kernel size (square) + repeated uint32 stride = 4; // defaults to 1 + + // For 2D pooling only, the *_h and *_w versions may also be used to + // specify both spatial dimensions. + optional uint32 pad_h = 5 [default = 0]; // The padding height (2D only) + optional uint32 pad_w = 6 [default = 0]; // The padding width (2D only) + optional uint32 kernel_h = 7; // The kernel height (2D only) + optional uint32 kernel_w = 8; // The kernel width (2D only) + optional uint32 stride_h = 9 [default = 1]; // The stride height (2D only) + optional uint32 stride_w = 10 [default = 1]; // The stride width (2D only) enum Engine { DEFAULT = 0; CAFFE = 1; @@ -1520,6 +1540,10 @@ message PoolingParameter { // If global_pooling then it will pool over the size of the bottom by doing // kernel_h = bottom->height and kernel_w = bottom->width optional bool global_pooling = 12 [default = false]; + // The axis to interpret as "channels" when performing pooling. 
+ // Preceding dimensions are treated as independent inputs; + // succeeding dimensions are treated as "spatial". + optional int32 axis = 14 [default = 1]; } message PowerParameter { @@ -1571,6 +1595,12 @@ message PriorBoxParameter { optional float offset = 13 [default = 0.5]; } +message PSROIPoolingParameter { + required float spatial_scale = 1; + required int32 output_dim = 2; // output channel number + required int32 group_size = 3; // number of groups to encode position-sensitive score maps + } + message PythonParameter { optional string module = 1; optional string layer = 2; @@ -1716,6 +1746,18 @@ message ReshapeParameter { optional int32 num_axes = 3 [default = -1]; } +// Message that stores parameters used by ROIPoolingLayer +message ROIPoolingParameter { + // Pad, kernel size, and stride are all given as a single value for equal + // dimensions in height and width or as Y, X pairs. + optional uint32 pooled_h = 1 [default = 0]; // The pooled output height + optional uint32 pooled_w = 2 [default = 0]; // The pooled output width + // Multiplicative spatial scale factor to translate ROI coords from their + // input scale to the scale used when pooling + optional float spatial_scale = 3 [default = 1]; + repeated uint32 pooled_size = 4; // pooled size for 3D blobs +} + message ScaleParameter { // The first axis of bottom[0] (the first input Blob) along which to apply // bottom[1] (the second input Blob). May be negative to index from the end @@ -1773,6 +1815,13 @@ message SliceParameter { optional uint32 slice_dim = 1 [default = 1]; } +message SmoothL1LossParameter { + // SmoothL1Loss(x) = + // 0.5 * (sigma * x) ** 2 -- if x < 1.0 / sigma / sigma + // |x| - 0.5 / sigma / sigma -- otherwise + optional float sigma = 1 [default = 1]; +} + // Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer message SoftmaxParameter { enum Engine { diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index f7e7ac1cd..a0908bbf4 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -688,7 +688,7 @@ void Solver::TestClassification(const int test_net_id) { if (param_.test_compute_loss()) { #ifdef USE_MLSL mn::allreduce(&loss, 1); - loss /= (param_.test_iter(test_net_id) * mn::get_nodes_count()); + loss /= (param_.test_iter(test_net_id) * mn::get_group_size()); if (mn::get_node_id() == 0) { LOG(INFO) << "Test loss: " << loss; } @@ -705,11 +705,16 @@ void Solver::TestClassification(const int test_net_id) { const int output_blob_index = test_net->output_blob_indices()[test_score_output_id[i]]; const string& output_name = test_net->blob_names()[output_blob_index]; - const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; + const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index] +#ifdef USE_MLSL + * mn::get_distrib()->get_data_parts() +#endif + ; + ostringstream loss_msg_stream; #ifdef USE_MLSL const Dtype mean_score = - test_score[i] / (param_.test_iter(test_net_id) * mn::get_nodes_count()); + test_score[i] / (param_.test_iter(test_net_id) * mn::get_group_size()); #else /* !USE_MLSL */ const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); #endif /* USE_MLSL */ diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 5347dcdf7..dcf29e789 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -210,7 +210,7 @@ void SGDSolver::ApplyUpdate(int param_id) { } #ifdef ENABLE_SGD_FUSION - if (Caffe::mode() == Caffe::CPU) + if ((Caffe::mode() == Caffe::CPU) && 
(this->type() == string("SGD"))) { //VLOG(1) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; //LOG(INFO) << "Use Normalize_Regularize_ComputeUpdateValue_Update_Fusion for SGD"; @@ -251,16 +251,14 @@ template <> void axpy_axpby_copy(size_t count, const float decay, const float* net_params_data, float *net_params_diff, const float rate, const float momentum, float* history_data) { - float temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 #pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { - temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - history_data[i] = temp_result; - net_params_diff[i] = temp_result; + history_data[i] = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = history_data[i]; } } @@ -268,16 +266,14 @@ template <> void axpy_axpby_copy(size_t count, const double decay, const double* net_params_data, double *net_params_diff, const double rate, const double momentum, double* history_data) { - double temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 #pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { - temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - history_data[i] = temp_result; - net_params_diff[i] = temp_result; + history_data[i] = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = history_data[i]; } } //End: For L1 Regularize_ComputeUpdateValue_Fusion @@ -292,17 +288,15 @@ template <> void axpy_axpby_copy_axpy(size_t count, const float decay, float* net_params_data, float *net_params_diff, const float rate, const float momentum, float* history_data, const float update_param) { - float temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 #pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { - temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - history_data[i] = temp_result; - net_params_diff[i] = temp_result; - net_params_data[i] = update_param * temp_result + net_params_data[i]; + history_data[i] = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = history_data[i]; + net_params_data[i] = update_param * net_params_diff[i] + net_params_data[i]; } } @@ -310,16 +304,15 @@ template <> void axpy_axpby_copy_axpy(size_t count, const double decay, double* net_params_data, double *net_params_diff, const double rate, const double momentum, double* history_data, const double update_param) { - double temp_result = 0.; #ifdef _OPENMP //#pragma omp parallel for simd schedule(static) //Not work for GCC 4.8 #pragma omp parallel for schedule(static) #pragma simd #endif for (size_t i = 0; i < count; ++i) { - temp_result = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; - net_params_diff[i] = temp_result; - net_params_data[i] = update_param * temp_result + net_params_data[i]; + history_data[i] = rate * (decay * net_params_data[i] + net_params_diff[i]) + momentum * history_data[i]; + net_params_diff[i] = history_data[i]; + net_params_data[i] = update_param * net_params_diff[i] + net_params_data[i]; } } //End: 
For L2 Regularize_ComputeUpdateValue_Update_Fusion @@ -472,7 +465,6 @@ void SGDSolver<Dtype>::Normalize(int param_id) { //LOG(INFO) << "Normalize stage: Normalize stage is not skipped."; // Scale gradient to counterbalance accumulation. const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params(); - const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); switch (Caffe::mode()) { diff --git a/src/caffe/test/test_bbox_util.cpp b/src/caffe/test/test_bbox_util.cpp index 83d20d944..02782f128 100644 --- a/src/caffe/test/test_bbox_util.cpp +++ b/src/caffe/test/test_bbox_util.cpp @@ -303,18 +303,27 @@ TEST_F(CPUBBoxUtilTest, TestOutputBBox) { resize_param.set_resize_mode( ResizeParameter_Resize_mode_FIT_LARGE_SIZE_AND_PAD); OutputBBox(bbox, img_size, has_resize, resize_param, &out_bbox); - CHECK_EQ(out_bbox.xmin(), 0.); - CHECK_EQ(out_bbox.ymin(), 50.); - CHECK_EQ(out_bbox.xmax(), 150.); - CHECK_EQ(out_bbox.ymax(), 150.); + //CHECK_EQ(out_bbox.xmin(), 0.); + //CHECK_EQ(out_bbox.ymin(), 50.); // Switching from GCC to ICC changes rounding enough to push the error past an exact-equality check + //CHECK_EQ(out_bbox.xmax(), 150.); + //CHECK_EQ(out_bbox.ymax(), 150.); + static const float errorBound = 1e-4; + EXPECT_NEAR(out_bbox.xmin(), 0., errorBound); + EXPECT_NEAR(out_bbox.ymin(), 50., errorBound); + EXPECT_NEAR(out_bbox.xmax(), 150., errorBound); + EXPECT_NEAR(out_bbox.ymax(), 150., errorBound); img_size.first = 500; img_size.second = 300; OutputBBox(bbox, img_size, has_resize, resize_param, &out_bbox); - CHECK_EQ(out_bbox.xmin(), 0.); - CHECK_EQ(out_bbox.ymin(), 150.); - CHECK_EQ(out_bbox.xmax(), 50.); - CHECK_EQ(out_bbox.ymax(), 250.); + //CHECK_EQ(out_bbox.xmin(), 0.); + //CHECK_EQ(out_bbox.ymin(), 150.); + //CHECK_EQ(out_bbox.xmax(), 50.); // Switching from GCC to ICC changes rounding enough to push the error past an exact-equality check + //CHECK_EQ(out_bbox.ymax(), 250.); + EXPECT_NEAR(out_bbox.xmin(), 0., errorBound); + EXPECT_NEAR(out_bbox.ymin(), 150., errorBound); + EXPECT_NEAR(out_bbox.xmax(), 50., errorBound); + EXPECT_NEAR(out_bbox.ymax(), 250., errorBound); } TEST_F(CPUBBoxUtilTest, TestJaccardOverlap) { diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index 40895e0bd..261b3228f 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -55,6 +55,10 @@ using caffe::CAFFE_TEST_CUDA_PROP; int main(int argc, char** argv) { ::testing::InitGoogleMock(&argc, argv); caffe::GlobalInit(&argc, &argv); +#ifdef USE_MLSL + caffe::mn::init(&argc, &argv); +#endif + #ifndef CPU_ONLY // Before starting testing, let's first print out a few cuda device info. int device; diff --git a/src/caffe/test/test_deconvolution3d_layer.cpp b/src/caffe/test/test_deconvolution3d_layer.cpp new file mode 100755 index 000000000..110cdf2cc --- /dev/null +++ b/src/caffe/test/test_deconvolution3d_layer.cpp @@ -0,0 +1,440 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. 
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include + +#include "gtest/gtest.h" + +#include "caffe/blob.hpp" +#include "caffe/common.hpp" +#include "caffe/filler.hpp" +#include "caffe/layers/deconv_layer.hpp" + +#include "caffe/test/test_caffe_main.hpp" +#include "caffe/test/test_gradient_check_util.hpp" + +namespace caffe { + +// Since ConvolutionLayerTest checks the shared conv/deconv code in detail, +// we'll just do a simple forward test and a gradient check. 
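The shapes the new 3D test asserts are not arbitrary; they follow from the usual transposed-convolution size relation out = stride * (in - 1) + kernel - 2 * pad. A minimal sketch of that arithmetic (the helper below is hypothetical, shown only to make the hard-coded shapes checkable):

```cpp
// Hypothetical helper, not part of the test: the transposed-convolution
// output size per spatial axis.
#include <cassert>

int deconv_out_size(int in, int kernel, int stride, int pad) {
  return stride * (in - 1) + kernel - 2 * pad;
}

int main() {
  // Bottom shape {2, 3, 6, 4, 6} with kernel 3, stride 2, pad 0 yields the
  // {2, 4, 13, 9, 13} top shape that TestSetup asserts (channels become
  // num_output = 4).
  assert(deconv_out_size(6, 3, 2, 0) == 13);
  assert(deconv_out_size(4, 3, 2, 0) == 9);
  return 0;
}
```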
+int initial_list[]={2,3,6,4,6}; +vector<int> initial_vector(initial_list,initial_list+5); +template <typename TypeParam> +class DeconvolutionLayerTest3d : public MultiDeviceTest<TypeParam> { + typedef typename TypeParam::Dtype Dtype; + + protected: + DeconvolutionLayerTest3d() + : blob_bottom_(new Blob<Dtype>(initial_vector)), + blob_bottom_2_(new Blob<Dtype>(initial_vector)), + blob_top_(new Blob<Dtype>()), + blob_top_2_(new Blob<Dtype>()) {} + virtual void SetUp() { + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller<Dtype> filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~DeconvolutionLayerTest3d() { + delete blob_bottom_; + delete blob_bottom_2_; + delete blob_top_; + delete blob_top_2_; + } + + Blob<Dtype>* const blob_bottom_; + Blob<Dtype>* const blob_bottom_2_; + Blob<Dtype>* const blob_top_; + Blob<Dtype>* const blob_top_2_; + vector<Blob<Dtype>*> blob_bottom_vec_; + vector<Blob<Dtype>*> blob_top_vec_; +}; + +TYPED_TEST_CASE(DeconvolutionLayerTest3d, TestDtypesAndDevices); + +TYPED_TEST(DeconvolutionLayerTest3d, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + shared_ptr<Layer<Dtype> > layer( + new DeconvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + + int iarray[] = {2,4,13,9,13}; + vector<int> desired_shape(iarray, iarray+5); + for(size_t i=0; i<desired_shape.size(); ++i){ + EXPECT_EQ(this->blob_top_->shape()[i], desired_shape[i]); + EXPECT_EQ(this->blob_top_2_->shape()[i], desired_shape[i]); + } + + // setting group should not change the shape + convolution_param->set_num_output(3); + convolution_param->set_group(3); + layer.reset(new DeconvolutionLayer<Dtype>(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + desired_shape[1]=3; + for(size_t i=0; i<desired_shape.size(); ++i){ + EXPECT_EQ(this->blob_top_->shape()[i], desired_shape[i]); + EXPECT_EQ(this->blob_top_2_->shape()[i], desired_shape[i]); + } +} + +TYPED_TEST(DeconvolutionLayerTest3d, TestSimpleDeconvolution) { + typedef typename TypeParam::Dtype Dtype; + float fill_weight = 0.85; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + // test for a set of convolution parameters: 
kernel_size = 3, stride = 2, pad = 0, output_num = 4 + // std::cout << "Test for another kernel parameter kernel_size = 3, stride = 2, pad = 0, output_num = 4"<add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("constant"); + convolution_param->mutable_weight_filler()->set_value(fill_weight); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + shared_ptr > layer( + new DeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // constant-fill the bottom blobs + FillerParameter filler_param; + filler_param.set_value(1.); + ConstantFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // simply check that accumulation works with overlapping filters + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int n = 0; n < this->blob_top_->shape()[0]; ++n) { + for (int c = 0; c < this->blob_top_->shape()[1]; ++c) { + for (int d = 0; d < this->blob_top_->shape()[2]; ++d) { + for (int h = 0; h < this->blob_top_->shape()[3]; ++h) { + for (int w = 0; w < this->blob_top_->shape()[4]; ++w) { + Dtype expected = 0.1 + 3 * fill_weight; + bool d_overlap = d % 2 == 0 && d > 0 + && d < this->blob_top_->shape()[2] - 1; + bool h_overlap = h % 2 == 0 && h > 0 + && h < this->blob_top_->shape()[3] - 1; + bool w_overlap = w % 2 == 0 && w > 0 + && w < this->blob_top_->shape()[4] - 1; + + if (d_overlap && h_overlap && w_overlap){ + expected += 21 * fill_weight; + } else if ((d_overlap && h_overlap) || (d_overlap && w_overlap) || (h_overlap && w_overlap)){ + expected += 9 * fill_weight; + } else if (d_overlap || h_overlap || w_overlap){ + expected += 3 * fill_weight; + } + int off_list[]={n,c,d,h,w}; + vector off_set(off_list, off_list + 5); + EXPECT_NEAR(top_data[this->blob_top_->offset(off_set)], expected, 1e-4); + //std::cout << "self_cal: " << expected << "caffe_cal: " << top_data[this->blob_top_->offset(off_set)] << std::endl; + } + } + } + } + } + + // Test for another kernel parameter kernel_size = 2, stride = 1, pad = 0, output_num = 3 + // std::cout << "Test for another kernel parameter kernel_size = 2, stride = 1, pad = 0, output_num = 3"<clear_kernel_size(); + convolution_param->add_kernel_size(2); + convolution_param->clear_stride(); + convolution_param->add_stride(1); + convolution_param->set_num_output(3); + layer.reset(new DeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // constant-fill the bottom blobs + filler_param.set_value(1.); + caffe_set(this->blob_bottom_->count(), Dtype(0), this->blob_bottom_->mutable_cpu_data()); + caffe_set(this->blob_bottom_2_->count(), Dtype(0), this->blob_bottom_2_->mutable_cpu_data()); + caffe_set(this->blob_top_->count(), Dtype(0), this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_top_2_->count(), Dtype(0), this->blob_top_2_->mutable_cpu_data()); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // simply check that accumulation works with overlapping filters + top_data = this->blob_top_->cpu_data(); + for (int n = 0; n < this->blob_top_->shape()[0]; ++n) { + for (int c = 0; c < this->blob_top_->shape()[1]; ++c) { + for (int d = 0; d < this->blob_top_->shape()[2]; ++d) { + for (int h = 0; h < 
this->blob_top_->shape()[3]; ++h) { + for (int w = 0; w < this->blob_top_->shape()[4]; ++w) { + Dtype expected = 0.1 + 3 * fill_weight; + bool d_overlap = d > 0 && d < this->blob_top_->shape()[2] - 1; + bool h_overlap = h > 0 && h < this->blob_top_->shape()[3] - 1; + bool w_overlap = w > 0 && w < this->blob_top_->shape()[4] - 1; + + if (d_overlap && h_overlap && w_overlap){ + expected += 21 * fill_weight; + } else if ((d_overlap && h_overlap) || (d_overlap && w_overlap) || (h_overlap && w_overlap)){ + expected += 9 * fill_weight; + } else if (d_overlap || h_overlap || w_overlap){ + expected += 3 * fill_weight; + } + int off_list[]={n,c,d,h,w}; + vector off_set(off_list, off_list + 5); + EXPECT_NEAR(top_data[this->blob_top_->offset(off_set)], expected, 1e-4); + // std::cout << "self_cal: " << expected << "caffe_cal: " << top_data[this->blob_top_->offset(off_set)] << std::endl; + } + } + } + } + } + + // Test for another kernel parameter kernel_size = 3, stride = 1, pad = 1, output_num = 5 + // std::cout << "Test for another kernel parameter kernel_size = 3, stride = 1, pad = 1, output_num = 5"<clear_kernel_size(); + convolution_param->add_kernel_size(3); + convolution_param->clear_stride(); + convolution_param->add_stride(1); + convolution_param->clear_pad(); + convolution_param->add_pad(1); + convolution_param->set_num_output(5); + layer.reset(new DeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // constant-fill the bottom blobs + filler_param.set_value(1.); + caffe_set(this->blob_bottom_->count(), Dtype(0), this->blob_bottom_->mutable_cpu_data()); + caffe_set(this->blob_bottom_2_->count(), Dtype(0), this->blob_bottom_2_->mutable_cpu_data()); + caffe_set(this->blob_top_->count(), Dtype(0), this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_top_2_->count(), Dtype(0), this->blob_top_2_->mutable_cpu_data()); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // simply check that accumulation works with overlapping filters + top_data = this->blob_top_->cpu_data(); + for (int n = 0; n < this->blob_top_->shape()[0]; ++n) { + for (int c = 0; c < this->blob_top_->shape()[1]; ++c) { + for (int d = 0; d < this->blob_top_->shape()[2]; ++d) { + for (int h = 0; h < this->blob_top_->shape()[3]; ++h) { + for (int w = 0; w < this->blob_top_->shape()[4]; ++w) { + Dtype expected = 0.1 + 3 * 8 * fill_weight; + bool d_overlap = d > 0 && d < this->blob_top_->shape()[2] - 1; + bool h_overlap = h > 0 && h < this->blob_top_->shape()[3] - 1; + bool w_overlap = w > 0 && w < this->blob_top_->shape()[4] - 1; + + if (d_overlap && h_overlap && w_overlap){ + expected += 3 * 19 * fill_weight; + } else if ((d_overlap && h_overlap) || (d_overlap && w_overlap) || (h_overlap && w_overlap)){ + expected += 3 * 10 * fill_weight; + } else if (d_overlap || h_overlap || w_overlap){ + expected += 3 * 4 * fill_weight; + } + int off_list[]={n,c,d,h,w}; + vector off_set(off_list, off_list + 5); + EXPECT_NEAR(top_data[this->blob_top_->offset(off_set)], expected, 1e-4); + // std::cout << "self_cal: " << expected << "caffe_cal: " << top_data[this->blob_top_->offset(off_set)] << std::endl; + } + } + } + } + } + + // Test for another kernel parameter kernel_size = 2, stride = 2, pad = 0, output_num = 4 + // std::cout << "Test for another kernel parameter kernel_size = 2, stride = 2, pad = 0, output_num = 4"<clear_kernel_size(); + convolution_param->add_kernel_size(2); + 
convolution_param->clear_stride(); + convolution_param->add_stride(2); + convolution_param->clear_pad(); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_weight_filler()->set_mean(0.0); + convolution_param->mutable_weight_filler()->set_std(0.1); + // convolution_param->mutable_weight_filler()->set_type("constant"); + // convolution_param->mutable_weight_filler()->set_value(fill_weight); + + // reset layer + layer.reset(new DeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // constant-fill the bottom blobs + filler_param.set_value(1.); + caffe_set(this->blob_bottom_->count(), Dtype(0), this->blob_bottom_->mutable_cpu_data()); + caffe_set(this->blob_bottom_2_->count(), Dtype(0), this->blob_bottom_2_->mutable_cpu_data()); + caffe_set(this->blob_top_->count(), Dtype(0), this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_top_2_->count(), Dtype(0), this->blob_top_2_->mutable_cpu_data()); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + Dtype* gaussian_weight = layer->blobs()[0]->mutable_cpu_data(); + EXPECT_EQ(layer->blobs()[0]->count(), 96); + // check that deconvolution works as the transport of convolution + top_data = this->blob_top_->cpu_data(); + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + for (int n = 0; n < this->blob_top_->shape()[0]; ++n) { + for (int c = 0; c < this->blob_top_->shape()[1]; ++c) { + for (int d = 0; d < this->blob_top_->shape()[2]; ++d) { + for (int h = 0; h < this->blob_top_->shape()[3]; ++h) { + for (int w = 0; w < this->blob_top_->shape()[4]; ++w) { + Dtype expected = 0.1; + for (int u = 0; u <= 1; u++) { + for (int v = 0; v <= 1; v++) { + for (int l = 0; l <= 1; l++) { + bool in_zone = ((d - u) >= 0 && (d - u) / 2 < this->blob_bottom_->shape()[2]) \ + && ((h - v) >= 0 && (h - v) / 2 < this->blob_bottom_->shape()[3]) \ + && ((w - l) >= 0 && (w - l) / 2 < this->blob_bottom_->shape()[4]); + bool at_pixel = ((d - u) % 2 == 0) && ((h - v) % 2 == 0) && ((w - l) % 2 == 0); + if (in_zone && at_pixel) { + for (int cb = 0; cb < this->blob_bottom_->shape()[1]; cb++) { + int bottom_list[] = {n, cb, (d - u) / 2, (h - v) / 2, (w - l) / 2}; + vector bottom_offset(bottom_list, bottom_list + 5); + int weight_offset = (((cb * this->blob_top_->shape()[1] + c) * 2 + u) * 2 + v) * 2 + l; + expected += gaussian_weight[weight_offset] * bottom_data[this->blob_bottom_->offset(bottom_offset)]; + } + } + } + } + } + int off_list[] = {n, c, d, h, w}; + vector off_set(off_list, off_list + 5); + EXPECT_NEAR(top_data[this->blob_top_->offset(off_set)], expected, 1e-4); + } + } + } + } + } + + // Test for another kernel parameter kernel_size = 3, stride = 2, pad = 1, output_num = 4 + // std::cout << "Test for another kernel parameter kernel_size = 2, stride = 2, pad = 0, output_num = 4"<clear_kernel_size(); + convolution_param->add_kernel_size(3); + convolution_param->clear_stride(); + convolution_param->add_stride(2); + convolution_param->clear_pad(); + convolution_param->add_pad(1); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_weight_filler()->set_mean(0.0); + convolution_param->mutable_weight_filler()->set_std(0.1); + // reset layer + layer.reset(new DeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // 
constant-fill the bottom blobs + filler_param.set_value(1.); + caffe_set(this->blob_bottom_->count(), Dtype(0), this->blob_bottom_->mutable_cpu_data()); + caffe_set(this->blob_bottom_2_->count(), Dtype(0), this->blob_bottom_2_->mutable_cpu_data()); + caffe_set(this->blob_top_->count(), Dtype(0), this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_top_2_->count(), Dtype(0), this->blob_top_2_->mutable_cpu_data()); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + gaussian_weight = layer->blobs()[0]->mutable_cpu_data(); + EXPECT_EQ(layer->blobs()[0]->count(), 324); + // check that deconvolution works as the transport of convolution + top_data = this->blob_top_->cpu_data(); + bottom_data = this->blob_bottom_->cpu_data(); + for (int n = 0; n < this->blob_top_->shape()[0]; ++n) { + for (int c = 0; c < this->blob_top_->shape()[1]; ++c) { + for (int d = 0; d < this->blob_top_->shape()[2]; ++d) { + for (int h = 0; h < this->blob_top_->shape()[3]; ++h) { + for (int w = 0; w < this->blob_top_->shape()[4]; ++w) { + Dtype expected = 0.1; + for (int u = 0; u <= 2; u++) { + for (int v = 0; v <= 2; v++) { + for (int l = 0; l <= 2; l++) { + bool in_zone = ((d + 1 - u) >= 0 && (d + 1 - u) / 2 < this->blob_bottom_->shape()[2]) \ + && ((h + 1 - v) >= 0 && (h + 1 - v) / 2 < this->blob_bottom_->shape()[3]) \ + && ((w + 1 - l) >= 0 && (w + 1 - l) / 2 < this->blob_bottom_->shape()[4]); + bool at_pixel = ((d + 1 - u) % 2 == 0) && ((h + 1 - v) % 2 == 0) && ((w + 1 - l) % 2 == 0); + if (in_zone && at_pixel) { + for (int cb = 0; cb < this->blob_bottom_->shape()[1]; cb++) { + int bottom_list[] = {n, cb, (d + 1 - u) / 2, (h + 1 - v) / 2, (w + 1 - l) / 2}; + vector bottom_offset(bottom_list, bottom_list + 5); + int weight_offset = (((cb * this->blob_top_->shape()[1] + c) * 3 + u) * 3 + v) * 3 + l; + expected += gaussian_weight[weight_offset] * bottom_data[this->blob_bottom_->offset(bottom_offset)]; + } + } + } + } + } + int off_list[] = {n, c, d, h, w}; + vector off_set(off_list, off_list + 5); + EXPECT_NEAR(top_data[this->blob_top_->offset(off_set)], expected, 1e-4); + } + } + } + } + } +} + +TYPED_TEST(DeconvolutionLayerTest3d, TestGradient3D) { + typedef typename TypeParam::Dtype Dtype; + vector bottom_shape(5); + bottom_shape[0] = this->blob_bottom_vec_[0]->shape(0); + bottom_shape[1] = this->blob_bottom_vec_[0]->shape(1); + bottom_shape[2] = 2; + bottom_shape[3] = 3; + bottom_shape[4] = 2; + FillerParameter filler_param; + GaussianFiller filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(2); + convolution_param->add_stride(2); + convolution_param->add_pad(1); + convolution_param->set_num_output(2); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + DeconvolutionLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_); +} + +} // namespace caffe diff --git a/src/caffe/test/test_maxpool_dropout_layers.cpp b/src/caffe/test/test_maxpool_dropout_layers.cpp index 3d9b07a80..ad9d3321d 100644 --- a/src/caffe/test/test_maxpool_dropout_layers.cpp +++ 
b/src/caffe/test/test_maxpool_dropout_layers.cpp @@ -81,8 +81,8 @@ TYPED_TEST(MaxPoolingDropoutTest, TestSetup) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); PoolingLayer max_layer(layer_param); max_layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); DropoutLayer dropout_layer(layer_param); @@ -98,8 +98,8 @@ TYPED_TEST(MaxPoolingDropoutTest, TestForward) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); @@ -128,8 +128,8 @@ TYPED_TEST(MaxPoolingDropoutTest, TestBackward) { LayerParameter layer_param; layer_param.set_phase(TRAIN); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); diff --git a/src/caffe/test/test_mkl_pooling_layer.cpp b/src/caffe/test/test_mkl_pooling_layer.cpp index 3df1590f1..19347d185 100644 --- a/src/caffe/test/test_mkl_pooling_layer.cpp +++ b/src/caffe/test/test_mkl_pooling_layer.cpp @@ -83,7 +83,7 @@ class MKLPoolingLayerTest : public MultiDeviceTest { void TestForwardSquare() { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); + pooling_param->add_kernel_size(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); const int num = 2; const int channels = 2; @@ -413,8 +413,8 @@ TYPED_TEST(MKLPoolingLayerTest, TestSetup) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); MKLPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); @@ -427,9 +427,9 @@ TYPED_TEST(MKLPoolingLayerTest, TestSetupPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); MKLPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); @@ -502,8 +502,8 @@ TYPED_TEST(MKLPoolingLayerTest, TestGradientMax) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_stride(2); + pooling_param->add_pad(1); 
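The pooling-test edits here and in the hunks below all track one proto change: PoolingParameter's pad, kernel_size, and stride became repeated fields (one value per spatial dimension), so protoc generates add_*() accessors in place of the old singular set_*() ones. A short sketch of the regenerated API, assuming the caffe.pb.h produced from this patch:

```cpp
// Sketch of the API shift behind these test edits; assumes the caffe.pb.h
// regenerated from the PoolingParameter changes in this patch.
#include <iostream>
#include "caffe/proto/caffe.pb.h"

int main() {
  caffe::PoolingParameter pooling_param;
  // Old (singular) API, no longer generated:
  //   pooling_param.set_kernel_size(3);
  // New (repeated) API; per the proto comment, a single value still means
  // "equal in all spatial dimensions", while N values configure an N-D
  // pooling region:
  pooling_param.add_kernel_size(3);
  pooling_param.add_stride(2);
  pooling_param.add_pad(1);
  std::cout << "kernel dims given: " << pooling_param.kernel_size_size()
            << ", first = " << pooling_param.kernel_size(0) << std::endl;
  return 0;
}
```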
pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); MKLPoolingLayer layer(layer_param); GradientChecker checker(1e-4, 1e-2); @@ -517,9 +517,9 @@ TYPED_TEST(MKLPoolingLayerTest, TestForwardMaxPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); this->blob_bottom_->Reshape(1, 1, 3, 3); // Input: diff --git a/src/caffe/test/test_mkldnn_pooling_layer.cpp b/src/caffe/test/test_mkldnn_pooling_layer.cpp index a9d37010a..04cf68bed 100644 --- a/src/caffe/test/test_mkldnn_pooling_layer.cpp +++ b/src/caffe/test/test_mkldnn_pooling_layer.cpp @@ -83,7 +83,7 @@ class MKLDNNPoolingLayerTest : public MultiDeviceTest { void TestForwardSquare() { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); + pooling_param->add_kernel_size(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); const int num = 2; const int channels = 2; @@ -415,8 +415,8 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestSetup) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); MKLDNNPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); @@ -429,9 +429,9 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestSetupPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); MKLDNNPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); @@ -506,8 +506,8 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestGradientMax) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); MKLDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-4, 1e-2); @@ -521,9 +521,9 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestForwardMaxPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); this->blob_bottom_->Reshape(1, 1, 3, 3); // Input: @@ -590,9 +590,9 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestForwardAve) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); 
- pooling_param->set_kernel_size(3); - pooling_param->set_stride(1); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(1); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); this->blob_bottom_->Reshape(1, 1, 3, 3); FillerParameter filler_param; @@ -653,7 +653,7 @@ TYPED_TEST(MKLDNNPoolingLayerTest, DISABLED_TestGradientAve) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); MKLDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); @@ -672,8 +672,8 @@ TYPED_TEST(MKLDNNPoolingLayerTest, TestGradientAvePadded) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); MKLDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); diff --git a/src/caffe/test/test_multibox_loss_layer.cpp b/src/caffe/test/test_multibox_loss_layer.cpp index 96dd65f25..1b2cb4be6 100644 --- a/src/caffe/test/test_multibox_loss_layer.cpp +++ b/src/caffe/test/test_multibox_loss_layer.cpp @@ -235,8 +235,8 @@ class MultiBoxLossLayerTest : public MultiDeviceTest { // Fake layer PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); - pooling_param->set_kernel_size(10); - pooling_param->set_stride(10); + pooling_param->add_kernel_size(10); + pooling_param->add_stride(10); PoolingLayer pooling_layer(layer_param); Blob* fake_blob = new Blob(num_, 5, height_, width_); diff --git a/src/caffe/test/test_pooling_layer.cpp b/src/caffe/test/test_pooling_layer.cpp index 7f76b394b..ab4cc5496 100644 --- a/src/caffe/test/test_pooling_layer.cpp +++ b/src/caffe/test/test_pooling_layer.cpp @@ -86,7 +86,7 @@ class PoolingLayerTest : public MultiDeviceTest { void TestForwardSquare() { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); + pooling_param->add_kernel_size(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); const int num = 2; const int channels = 2; @@ -414,8 +414,8 @@ TYPED_TEST(PoolingLayerTest, TestSetup) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); @@ -428,9 +428,9 @@ TYPED_TEST(PoolingLayerTest, TestSetupPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, 
this->blob_top_vec_); @@ -501,8 +501,8 @@ TYPED_TEST(PoolingLayerTest, TestGradientMax) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); PoolingLayer layer(layer_param); GradientChecker checker(1e-4, 1e-2); @@ -516,9 +516,9 @@ TYPED_TEST(PoolingLayerTest, TestForwardMaxPadded) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); this->blob_bottom_->Reshape(1, 1, 3, 3); // Input: @@ -565,7 +565,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientMaxTopMask) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); this->blob_top_vec_.push_back(this->blob_top_mask_); PoolingLayer layer(layer_param); @@ -581,9 +581,9 @@ TYPED_TEST(PoolingLayerTest, TestForwardAve) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(1); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(1); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); this->blob_bottom_->Reshape(1, 1, 3, 3); FillerParameter filler_param; @@ -617,7 +617,7 @@ TYPED_TEST(PoolingLayerTest, TestGradientAve) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); PoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); @@ -635,8 +635,8 @@ TYPED_TEST(PoolingLayerTest, TestGradientAvePadded) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); PoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); @@ -678,7 +678,7 @@ class CuDNNPoolingLayerTest : public GPUDeviceTest { void TestForwardSquare() { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(2); + pooling_param->add_kernel_size(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); const int num = 2; const int channels = 2; @@ -1005,8 +1005,8 @@ TYPED_TEST_CASE(CuDNNPoolingLayerTest, TestDtypes); TYPED_TEST(CuDNNPoolingLayerTest, TestSetupCuDNN) { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + 
pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); CuDNNPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); @@ -1018,9 +1018,9 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestSetupCuDNN) { TYPED_TEST(CuDNNPoolingLayerTest, TestSetupPaddedCuDNN) { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(1); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); CuDNNPoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); @@ -1080,9 +1080,9 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxCuDNN) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); + pooling_param->add_stride(2); // currenty, cuDNN pooling does not support padding - pooling_param->set_pad(0); + pooling_param->add_pad(0); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); CuDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-4, 1e-2); @@ -1095,9 +1095,9 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxCuDNN) { TYPED_TEST(CuDNNPoolingLayerTest, TestForwardMaxPaddedCuDNN) { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_MAX); this->blob_bottom_->Reshape(1, 1, 3, 3); // Input: @@ -1160,11 +1160,11 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestGradientMaxTopMaskCuDNN) { TYPED_TEST(CuDNNPoolingLayerTest, TestForwardAveCuDNN) { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(1); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(1); // Currently, cuDNN pooling does not support padding, so we use // a simplified version of this test. 
- pooling_param->set_pad(0); + pooling_param->add_pad(0); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); this->blob_bottom_->Reshape(1, 1, 3, 3); FillerParameter filler_param; @@ -1189,7 +1189,7 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAveCuDNN) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); CuDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); @@ -1206,8 +1206,8 @@ TYPED_TEST(CuDNNPoolingLayerTest, TestGradientAvePaddedCuDNN) { PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); pooling_param->set_kernel_h(kernel_h); pooling_param->set_kernel_w(kernel_w); - pooling_param->set_stride(2); - pooling_param->set_pad(2); + pooling_param->add_stride(2); + pooling_param->add_pad(2); pooling_param->set_pool(PoolingParameter_PoolMethod_AVE); CuDNNPoolingLayer layer(layer_param); GradientChecker checker(1e-2, 1e-2); diff --git a/src/caffe/test/test_roi_pooling_layer.cpp b/src/caffe/test/test_roi_pooling_layer.cpp new file mode 100644 index 000000000..6be539e8b --- /dev/null +++ b/src/caffe/test/test_roi_pooling_layer.cpp @@ -0,0 +1,199 @@ +/* +All modification made by Intel Corporation: © 2016 Intel Corporation + +All contributions by the University of California: +Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014, 2015, the respective contributors +All rights reserved. +For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/src/caffe/test/test_roi_pooling_layer.cpp b/src/caffe/test/test_roi_pooling_layer.cpp
new file mode 100644
index 000000000..6be539e8b
--- /dev/null
+++ b/src/caffe/test/test_roi_pooling_layer.cpp
@@ -0,0 +1,199 @@
+/*
+All modification made by Intel Corporation: © 2016 Intel Corporation
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include <algorithm>
+#include <cfloat>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "boost/scoped_ptr.hpp"
+#include "gtest/gtest.h"
+
+#include "caffe/blob.hpp"
+#include "caffe/common.hpp"
+#include "caffe/filler.hpp"
+#include "caffe/layers/roi_pooling_layer.hpp"
+#include "caffe/test/test_caffe_main.hpp"
+#include "caffe/test/test_gradient_check_util.hpp"
+
+using boost::scoped_ptr;
+
+namespace caffe {
+
+template <typename TypeParam>
+class ROIPoolingLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  ROIPoolingLayerTest()
+      : blob_bottom_data_(new Blob<Dtype>(2, 2, 6, 8)),
+        blob_bottom_rois_(new Blob<Dtype>(4, 5, 1, 1)),
+        blob_top_data_(new Blob<Dtype>()),
+        blob_bottom_data_2_(new Blob<Dtype>(2, 3, 12, 20)),
+        blob_bottom_rois_2_(new Blob<Dtype>(1, 5, 1, 1)),
+        blob_top_data_2_(new Blob<Dtype>()) {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_std(10);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_data_);
+    // for (int i = 0; i < blob_bottom_data_->count(); ++i) {
+    //   blob_bottom_data_->mutable_cpu_data()[i] = i;
+    // }
+    blob_bottom_vec_.push_back(blob_bottom_data_);
+    int i = 0;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 0;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 0;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 5;  // y2 < 6
+    i = 1;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 6;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 2;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 7;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 5;  // y2 < 6
+    i = 2;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 1;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 1;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 6;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 4;  // y2 < 6
+    i = 3;
+    blob_bottom_rois_->mutable_cpu_data()[0 + 5*i] = 0;
+    blob_bottom_rois_->mutable_cpu_data()[1 + 5*i] = 3;  // x1 < 8
+    blob_bottom_rois_->mutable_cpu_data()[2 + 5*i] = 3;  // y1 < 6
+    blob_bottom_rois_->mutable_cpu_data()[3 + 5*i] = 3;  // x2 < 8
+    blob_bottom_rois_->mutable_cpu_data()[4 + 5*i] = 3;  // y2 < 6
+
+    blob_bottom_vec_.push_back(blob_bottom_rois_);
+    blob_top_vec_.push_back(blob_top_data_);
+
+    filler.Fill(this->blob_bottom_data_2_);
+    blob_bottom_vec_2_.push_back(blob_bottom_data_2_);
+
+    // Pool over the entire bottom of feature map 1
+    blob_bottom_rois_2_->mutable_cpu_data()[0] = 1;
+    blob_bottom_rois_2_->mutable_cpu_data()[1] = 0;
+    blob_bottom_rois_2_->mutable_cpu_data()[2] = 0;
+    blob_bottom_rois_2_->mutable_cpu_data()[3] = 19;
+    blob_bottom_rois_2_->mutable_cpu_data()[4] = 11;
+
+    blob_bottom_vec_2_.push_back(blob_bottom_rois_2_);
+    blob_top_vec_2_.push_back(blob_top_data_2_);
+  }
+  virtual ~ROIPoolingLayerTest() {
+    delete blob_bottom_data_;
+    delete blob_bottom_rois_;
+    delete blob_top_data_;
+    delete blob_bottom_data_2_;
+    delete blob_bottom_rois_2_;
+    delete blob_top_data_2_;
+  }
+  Blob<Dtype>* const blob_bottom_data_;
+  Blob<Dtype>* const blob_bottom_rois_;
+  Blob<Dtype>* const blob_top_data_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+
+  Blob<Dtype>* const blob_bottom_data_2_;
+  Blob<Dtype>* const blob_bottom_rois_2_;
+  Blob<Dtype>* const blob_top_data_2_;
+  vector<Blob<Dtype>*> blob_bottom_vec_2_;
+  vector<Blob<Dtype>*> blob_top_vec_2_;
+};
+
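Reading aid for the fixture above: each ROI occupies five consecutive values in the second bottom blob, `[batch_index, x1, y1, x2, y2]`, as the inline `x1 < 8` / `y1 < 6` comments indicate. A hypothetical helper (not part of the test) that makes the layout explicit:

```
// Hypothetical helper illustrating the ROI layout used by the fixture:
// five consecutive values per ROI in the rois bottom blob.
struct Roi { int batch_index, x1, y1, x2, y2; };

template <typename Dtype>
Roi ReadRoi(const Dtype* rois, int i) {
  return Roi{static_cast<int>(rois[5 * i + 0]),   // image index in the batch
             static_cast<int>(rois[5 * i + 1]),   // x1
             static_cast<int>(rois[5 * i + 2]),   // y1
             static_cast<int>(rois[5 * i + 3]),   // x2
             static_cast<int>(rois[5 * i + 4])};  // y2
}
```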
+TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(ROIPoolingLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ROIPoolingParameter* roi_pooling_param =
+      layer_param.mutable_roi_pooling_param();
+
+  // 12 x 20 pooling with bin_size_h == 1 && bin_size_w == 1
+  roi_pooling_param->set_pooled_h(12);
+  roi_pooling_param->set_pooled_w(20);
+  ROIPoolingLayer<Dtype> layer_2(layer_param);
+  layer_2.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  layer_2.Forward(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  for (int i = 0; i < this->blob_top_data_2_->count(); ++i) {
+    EXPECT_EQ(this->blob_top_data_2_->cpu_data()[i],
+        this->blob_bottom_data_2_->cpu_data()[i + 3*12*20]);
+  }
+
+  // 6 x 10 pooling with bin_size_h == 2 && bin_size_w == 2
+  roi_pooling_param->set_pooled_h(6);
+  roi_pooling_param->set_pooled_w(10);
+  ROIPoolingLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  layer.Forward(this->blob_bottom_vec_2_, this->blob_top_vec_2_);
+  int n = 1;
+  for (int c = 0; c < 3; ++c) {
+    for (int ph = 0; ph < 6; ++ph) {
+      for (int pw = 0; pw < 10; ++pw) {
+        Dtype maxval = -FLT_MAX;
+        for (int h = 2 * ph; h < 2 * (ph + 1); ++h) {
+          for (int w = 2 * pw; w < 2 * (pw + 1); ++w) {
+            maxval = std::max(maxval, this->blob_bottom_data_2_->cpu_data()[
+                ((n * 3 + c) * 12 + h) * 20 + w]);
+          }
+        }
+        EXPECT_EQ(this->blob_top_data_2_->cpu_data()[(c * 6 + ph) * 10 + pw],
+            maxval);
+      }
+    }
+  }
+}
+
+TYPED_TEST(ROIPoolingLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ROIPoolingParameter* roi_pooling_param =
+      layer_param.mutable_roi_pooling_param();
+  roi_pooling_param->set_pooled_h(3);
+  roi_pooling_param->set_pooled_w(4);
+  ROIPoolingLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-4, 1e-2);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_, 0);
+}
+
+} // namespace caffe
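The `TestForward` loops above check the Fast R-CNN ROI max-pooling rule: each `pooled_h x pooled_w` output bin takes the maximum over its sub-window of the ROI. A standalone sketch of that rule for one channel, under the test's whole-map ROI where every bin is exactly `bin_h x bin_w`:

```
#include <algorithm>
#include <cfloat>
#include <vector>

// Reference ROI max-pool for one channel (row-major height x width map).
// Mirrors the nested loops in TestForward: 12x20 input, 6x10 output, 2x2 bins.
std::vector<float> RoiMaxPool(const std::vector<float>& bottom,
                              int height, int width,
                              int pooled_h, int pooled_w) {
  std::vector<float> top(pooled_h * pooled_w, -FLT_MAX);
  const int bin_h = height / pooled_h;  // 12 / 6  == 2 in the test
  const int bin_w = width / pooled_w;   // 20 / 10 == 2 in the test
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      float& out = top[ph * pooled_w + pw];
      for (int h = bin_h * ph; h < bin_h * (ph + 1); ++h) {
        for (int w = bin_w * pw; w < bin_w * (pw + 1); ++w) {
          out = std::max(out, bottom[h * width + w]);  // max over the bin
        }
      }
    }
  }
  return top;
}
```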
#include "caffe/common.hpp" #include "caffe/filler.hpp" #include "caffe/layers/smooth_L1_loss_layer.hpp" - #include "caffe/test/test_caffe_main.hpp" #include "caffe/test/test_gradient_check_util.hpp" @@ -52,54 +53,48 @@ namespace caffe { template class SmoothL1LossLayerTest : public MultiDeviceTest { - typedef typename TypeParam::Dtype Dtype; + typedef typename TypeParam::Dtype Dtype; protected: SmoothL1LossLayerTest() : blob_bottom_data_(new Blob(10, 5, 1, 1)), blob_bottom_label_(new Blob(10, 5, 1, 1)), + blob_bottom_inside_weights_(new Blob(10, 5, 1, 1)), + blob_bottom_outside_weights_(new Blob(10, 5, 1, 1)), blob_top_loss_(new Blob()) { // fill the values + FillerParameter const_filler_param; + const_filler_param.set_value(-1.); + ConstantFiller const_filler(const_filler_param); FillerParameter filler_param; GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_data_); blob_bottom_vec_.push_back(blob_bottom_data_); + filler.Fill(this->blob_bottom_label_); blob_bottom_vec_.push_back(blob_bottom_label_); + + filler.Fill(this->blob_bottom_inside_weights_); + blob_bottom_vec_.push_back(blob_bottom_inside_weights_); + + filler.Fill(this->blob_bottom_outside_weights_); + blob_bottom_vec_.push_back(blob_bottom_outside_weights_); + blob_top_vec_.push_back(blob_top_loss_); } virtual ~SmoothL1LossLayerTest() { delete blob_bottom_data_; delete blob_bottom_label_; + delete blob_bottom_inside_weights_; + delete blob_bottom_outside_weights_; delete blob_top_loss_; } - void TestForward() { - // Get the loss without a specified objective weight -- should be - // equivalent to explicitly specifiying a weight of 1. - LayerParameter layer_param; - SmoothL1LossLayer layer_weight_1(layer_param); - layer_weight_1.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype loss_weight_1 = - layer_weight_1.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - // Get the loss again with a different objective weight; check that it is - // scaled appropriately. - const Dtype kLossWeight = 3.7; - layer_param.add_loss_weight(kLossWeight); - SmoothL1LossLayer layer_weight_2(layer_param); - layer_weight_2.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype loss_weight_2 = - layer_weight_2.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - const Dtype kErrorMargin = 1e-5; - EXPECT_NEAR(loss_weight_1 * kLossWeight, loss_weight_2, kErrorMargin); - // Make sure the loss is non-trivial. 
- const Dtype kNonTrivialAbsThresh = 1e-1; - EXPECT_GE(fabs(loss_weight_1), kNonTrivialAbsThresh); - } - Blob* const blob_bottom_data_; Blob* const blob_bottom_label_; + Blob* const blob_bottom_inside_weights_; + Blob* const blob_bottom_outside_weights_; Blob* const blob_top_loss_; vector*> blob_bottom_vec_; vector*> blob_top_vec_; @@ -107,20 +102,23 @@ class SmoothL1LossLayerTest : public MultiDeviceTest { TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesAndDevices); -TYPED_TEST(SmoothL1LossLayerTest, TestForward) { - this->TestForward(); -} - TYPED_TEST(SmoothL1LossLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + SmoothL1LossParameter* loss_param = + layer_param.mutable_smooth_l1_loss_param(); + loss_param->set_sigma(2.4); + const Dtype kLossWeight = 3.7; layer_param.add_loss_weight(kLossWeight); SmoothL1LossLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); GradientChecker checker(1e-2, 1e-2, 1701); checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); + this->blob_top_vec_, 0); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_, 1); } } // namespace caffe diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp index 07b6b8b75..59ea570a9 100644 --- a/src/caffe/test/test_stochastic_pooling.cpp +++ b/src/caffe/test/test_stochastic_pooling.cpp @@ -93,8 +93,8 @@ TYPED_TEST_CASE(CPUStochasticPoolingLayerTest, TestDtypes); TYPED_TEST(CPUStochasticPoolingLayerTest, TestSetup) { LayerParameter layer_param; PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num()); @@ -116,8 +116,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochastic) { LayerParameter layer_param; layer_param.set_phase(TRAIN); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); @@ -159,8 +159,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochasticTestPhase) { LayerParameter layer_param; layer_param.set_phase(TEST); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); PoolingLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); @@ -196,8 +196,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestGradient) { LayerParameter layer_param; layer_param.set_phase(TRAIN); PoolingParameter* pooling_param = layer_param.mutable_pooling_param(); - pooling_param->set_kernel_size(3); - pooling_param->set_stride(2); + pooling_param->add_kernel_size(3); + pooling_param->add_stride(2); pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC); PoolingLayer layer(layer_param); GradientChecker checker(1e-4, 1e-2); diff --git a/src/caffe/util/upgrade_proto.cpp 
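The stochastic-pooling tests that follow keep their structure; only the repeated-field setters change. As background for `TestStochastic` versus `TestStochasticTestPhase`: in the TRAIN phase the layer samples one activation per window with probability proportional to its magnitude, while in TEST it outputs the probability-weighted average (Zeiler & Fergus). A sketch of the train-phase rule for one window, with `rand01` as a hypothetical stand-in for the layer's RNG:

```
#include <vector>

// Train-phase stochastic pooling for one window of non-negative activations:
// pick value a[i] with probability a[i] / sum(a). rand01 is a uniform [0,1)
// sample supplied by the caller (a stand-in for the layer's RNG).
float StochasticPoolWindow(const std::vector<float>& a, float rand01) {
  float sum = 0.f;
  for (float v : a) sum += v;
  if (sum == 0.f) return 0.f;  // all-zero window: nothing to sample
  const float threshold = rand01 * sum;
  float cumulative = 0.f;
  for (float v : a) {
    cumulative += v;
    if (threshold < cumulative) return v;  // inverse-CDF sampling
  }
  return a.back();  // numerical fallback
}
```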
diff --git a/src/caffe/test/test_stochastic_pooling.cpp b/src/caffe/test/test_stochastic_pooling.cpp
index 07b6b8b75..59ea570a9 100644
--- a/src/caffe/test/test_stochastic_pooling.cpp
+++ b/src/caffe/test/test_stochastic_pooling.cpp
@@ -93,8 +93,8 @@ TYPED_TEST_CASE(CPUStochasticPoolingLayerTest, TestDtypes);
 TYPED_TEST(CPUStochasticPoolingLayerTest, TestSetup) {
   LayerParameter layer_param;
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
-  pooling_param->set_kernel_size(3);
-  pooling_param->set_stride(2);
+  pooling_param->add_kernel_size(3);
+  pooling_param->add_stride(2);
   PoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), this->blob_bottom_->num());
@@ -116,8 +116,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochastic) {
   LayerParameter layer_param;
   layer_param.set_phase(TRAIN);
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
-  pooling_param->set_kernel_size(3);
-  pooling_param->set_stride(2);
+  pooling_param->add_kernel_size(3);
+  pooling_param->add_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
   PoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -159,8 +159,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestStochasticTestPhase) {
   LayerParameter layer_param;
   layer_param.set_phase(TEST);
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
-  pooling_param->set_kernel_size(3);
-  pooling_param->set_stride(2);
+  pooling_param->add_kernel_size(3);
+  pooling_param->add_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
   PoolingLayer<TypeParam> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
@@ -196,8 +196,8 @@ TYPED_TEST(GPUStochasticPoolingLayerTest, TestGradient) {
   LayerParameter layer_param;
   layer_param.set_phase(TRAIN);
   PoolingParameter* pooling_param = layer_param.mutable_pooling_param();
-  pooling_param->set_kernel_size(3);
-  pooling_param->set_stride(2);
+  pooling_param->add_kernel_size(3);
+  pooling_param->add_stride(2);
   pooling_param->set_pool(PoolingParameter_PoolMethod_STOCHASTIC);
   PoolingLayer<TypeParam> layer(layer_param);
   GradientChecker<TypeParam> checker(1e-4, 1e-2);
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 05a743bc5..b29720276 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -313,7 +313,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
     if (type == "conv") {
       layer_param->mutable_convolution_param()->add_pad(v0_layer_param.pad());
     } else if (type == "pool") {
-      layer_param->mutable_pooling_param()->set_pad(v0_layer_param.pad());
+      layer_param->mutable_pooling_param()->add_pad(v0_layer_param.pad());
     } else {
       LOG(ERROR) << "Unknown parameter pad for layer type " << type;
       is_fully_compatible = false;
@@ -324,7 +324,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
       layer_param->mutable_convolution_param()->add_kernel_size(
          v0_layer_param.kernelsize());
     } else if (type == "pool") {
-      layer_param->mutable_pooling_param()->set_kernel_size(
+      layer_param->mutable_pooling_param()->add_kernel_size(
          v0_layer_param.kernelsize());
     } else {
       LOG(ERROR) << "Unknown parameter kernelsize for layer type " << type;
@@ -345,7 +345,7 @@ bool UpgradeV0LayerParameter(const V1LayerParameter& v0_layer_connection,
       layer_param->mutable_convolution_param()->add_stride(
          v0_layer_param.stride());
     } else if (type == "pool") {
-      layer_param->mutable_pooling_param()->set_stride(
+      layer_param->mutable_pooling_param()->add_stride(
          v0_layer_param.stride());
     } else {
       LOG(ERROR) << "Unknown parameter stride for layer type " << type;
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 5d0ea7f49..0373794c3 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -59,7 +59,9 @@ namespace bp = boost::python;
 #include "caffe/util/bbox_util.hpp"
 
 #ifdef USE_MLSL
+#include "caffe/multinode/mlsl.hpp"
 #include "caffe/multinode/multi_sync.hpp"
+#include "caffe/multinode/async_param_server.hpp"
 #endif /* USE_MLSL */
 
 using caffe::Blob;
@@ -118,6 +120,10 @@ DEFINE_bool(fast_compare, false,
 DEFINE_int32(fast_compare_max, 50, "Optional; Max errors for fast_compare");
 DEFINE_double(buffer_filler, std::nanf(""), "Buffer filler for compare tool");
+DEFINE_int32(n_group, 1, "Optional; if given, it specifies how many trees"
+    " we want in the async forest");
+DEFINE_int32(n_server, 0, "Optional; if given, it specifies how many parts"
+    " the model is split into, i.e. how many processes act as parameter servers");
 
 // A simple registry for caffe commands.
 typedef int (*BrewFunction)();
@@ -315,10 +321,17 @@ int train() {
 
 #ifdef USE_MLSL
   if (caffe::mn::is_multinode()) {
+    MPI_Barrier(MPI_COMM_WORLD);
     LOG(INFO) << "Configuring multinode setup";
-    caffe::MultiSync<float> sync(solver);
-    LOG(INFO) << "Starting Multi-node Optimization in MLSL environment";
-    sync.run();
+    if (!caffe::mn::is_param_server()) {
+      caffe::MultiSync<float> sync(solver);
+      LOG(INFO) << "Starting Multi-node Optimization in MLSL environment";
+      sync.run();
+    } else {
+      caffe::mn::AsyncParamServer<float> aps(solver);
+      LOG(INFO) << "Starting Parameter Server";
+      aps.Run();
+    }
   } else
 #endif /* USE_MLSL */
 
@@ -709,7 +722,20 @@ int main(int argc, char** argv) {
   // Run tool or show usage.
   caffe::GlobalInit(&argc, &argv);
 #ifdef USE_MLSL
+  caffe::mn::nGroup = FLAGS_n_group;
+  caffe::mn::nServer = FLAGS_n_server;
   caffe::mn::init(&argc, &argv);
+  CHECK_EQ(caffe::mn::get_world_size(),
+    caffe::mn::nGroup * caffe::mn::get_group_size() + caffe::mn::nServer);
+  if (caffe::mn::nGroup > 1) {
+    CHECK_GE(caffe::mn::nServer, 1)
+      << "Expected at least one parameter server to support multiple groups";
+  }
+  if (caffe::mn::get_node_rank() == 0) {
+    LOG(INFO) << "Number of groups: " << caffe::mn::nGroup
+              << ", group size: " << caffe::mn::get_group_size()
+              << ", number of parameter servers: " << caffe::mn::nServer;
+  }
 #endif
   if (argc == 2) {
 #ifdef WITH_PYTHON_LAYER
diff --git a/tools/extra/plot_log.gnuplot.example b/tools/extra/plot_log.gnuplot.example
index 748b96e69..02c68e1d2 100644
--- a/tools/extra/plot_log.gnuplot.example
+++ b/tools/extra/plot_log.gnuplot.example
@@ -4,7 +4,7 @@
 # Be warned that the fields in the training log may change in the future.
 # You had better check the data files before designing your own plots.
 
-# Please generate the neccessary data files with
+# Please generate the necessary data files with
 # /path/to/caffe/tools/extra/parse_log.sh before plotting.
 # Example usage:
 #     ./parse_log.sh mnist.log
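The `n_group`/`n_server` flags wired into `main()` above partition the MPI world into worker groups plus parameter-server ranks, and the added `CHECK_EQ` enforces that the partition is exact. A worked example with hypothetical numbers:

```
#include <cassert>

// Hypothetical topology: 9 MPI ranks = 2 worker groups of 4 + 1 param server.
// Mirrors the CHECK_EQ added in main():
//   world_size == n_group * group_size + n_server
int main() {
  const int world_size = 9;  // total MPI ranks launched (hypothetical)
  const int n_group = 2;     // -n_group: worker groups ("trees in the async forest")
  const int group_size = 4;  // per-group rank count, derived by MLSL from the launch
  const int n_server = 1;    // -n_server: must be >= 1 whenever n_group > 1
  assert(world_size == n_group * group_size + n_server);
  return 0;
}
```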