
Commit 4e1b276

Merge pull request #382 from drnikolaev/caffe-0.16: 0.16.3 rc

2 parents: 22970ac + 5cc8515

66 files changed: +2095, -449 lines (large commit; only a subset of the changed files is shown below).

cmake/Cuda.cmake (+8, -11)

@@ -4,7 +4,7 @@ endif()
 
 # Known NVIDIA GPU achitectures Caffe can be compiled for.
 # This list will be used for CUDA_ARCH_NAME = All option
-set(Caffe_known_gpu_archs "20 21(20) 30 35 50 60 61 62")
+set(Caffe_known_gpu_archs "30 35 50 52 60 61 70")
 
 ################################################################################################
 # A function for automatic detection of GPUs installed (if autodetection is enabled)
@@ -36,8 +36,7 @@ function(caffe_detect_installed_gpus out_variable)
                     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
 
     if(__nvcc_res EQUAL 0)
-      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
-      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from caffe_detect_gpus tool" FORCE)
+      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from caffe_detect_gpus tool" FORCE)
    endif()
  endif()
 
@@ -56,22 +55,22 @@ endfunction()
 # caffe_select_nvcc_arch_flags(out_variable)
 function(caffe_select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
+  set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
   set(__archs_name_default "All")
   if(NOT CMAKE_CROSSCOMPILING)
     list(APPEND __archs_names "Auto")
     set(__archs_name_default "Auto")
   endif()
 
   # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
-  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
+  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
   set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} )
   mark_as_advanced(CUDA_ARCH_NAME)
 
   # verify CUDA_ARCH_NAME value
   if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
     string(REPLACE ";" ", " __archs_names "${__archs_names}")
-    message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.")
+    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
   endif()
 
   if(${CUDA_ARCH_NAME} STREQUAL "Manual")
@@ -83,14 +82,12 @@ function(caffe_select_nvcc_arch_flags out_variable)
     unset(CUDA_ARCH_PTX CACHE)
   endif()
 
-  if(${CUDA_ARCH_NAME} STREQUAL "Fermi")
-    set(__cuda_arch_bin "20 21(20)")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(__cuda_arch_bin "50")
+    set(__cuda_arch_bin "50 52")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(__cuda_arch_bin "60 61 62")
+    set(__cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
     set(__cuda_arch_bin "70")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
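In short: Fermi (compute 2.x) disappears from the known-arch list, Maxwell gains sm_52, Pascal loses sm_62, and Volta (sm_70) is added. The Auto path still depends on caffe_detect_installed_gpus, which appears to compile and run a small CUDA probe through nvcc and cache its output (the __nvcc_out handling above). A minimal sketch of such a probe, assuming only the standard CUDA runtime API rather than quoting the shipped tool:

    // Sketch of a GPU probe like the one caffe_detect_gpus builds:
    // prints each visible device's compute capability as "major.minor".
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      int count = 0;
      if (cudaGetDeviceCount(&count) != cudaSuccess) return -1;
      for (int device = 0; device < count; ++device) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, device) == cudaSuccess)
          std::printf("%d.%d ", prop.major, prop.minor);
      }
      return 0;
    }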

include/caffe/blob.hpp (+6, -12)

@@ -94,24 +94,16 @@ class Blob {
     return diff_tensor_ ? diff_tensor_->type() : last_diff_type_;
   }
 
-  void lock_data() {
-    data_tensor_->lock_tensor();
-  }
-
-  void lock_diff() {
-    diff_tensor_->lock_tensor();
-  }
-
   bool diff_equals(const Blob& other) const {
     return diff_tensor_ == other.diff_tensor_;
   }
 
   void allocate_data(bool on_gpu = true) {
-    data_tensor_->mutable_memory(data_tensor_->type(), on_gpu);
+    data_tensor_->current_memory(on_gpu);
   }
 
   void allocate_diff(bool on_gpu = true) {
-    diff_tensor_->mutable_memory(diff_tensor_->type(), on_gpu);
+    diff_tensor_->current_memory(on_gpu);
   }
 
   size_t cpu_memory_data_use() const;
@@ -451,9 +443,11 @@ class Blob {
    */
   void ShareDiff(const Blob& other);
 
-
   template<typename Dtype>
-  void ToProto(BlobProto* proto, bool write_diff = false) const;
+  void ToProto(BlobProto* proto, bool store_in_old_format, bool write_diff = false) const;
+  template<typename Dtype>
+  void ToProtoBVLC(BlobProto* proto, bool write_diff = false) const;
+
   void FromProto(const BlobProto& proto, bool reshape = true);
   bool ShapeEquals(const BlobProto& other);
   std::string to_string(int indent = 0) const;  // debug helper
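ToProto() now takes an explicit store_in_old_format flag, and the separate ToProtoBVLC() keeps the upstream BVLC serialization for interoperability. A hedged usage sketch; the helper function and the flag values chosen are illustrative, not from this commit:

    #include "caffe/blob.hpp"
    #include "caffe/proto/caffe.pb.h"

    // Illustrative helper: serialize a blob in the fork's format, or in
    // the BVLC layout when upstream tools need to read the result.
    void save_blob(const caffe::Blob& blob, caffe::BlobProto* proto, bool for_bvlc) {
      if (for_bvlc)
        blob.ToProtoBVLC<float>(proto, /*write_diff=*/false);
      else
        blob.ToProto<float>(proto, /*store_in_old_format=*/false, /*write_diff=*/false);
    }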

include/caffe/common.hpp (+9, -1)

@@ -441,7 +441,15 @@ class Caffe {
 #endif
   }
 
-  static int thread_count() {
+#ifndef CPU_ONLY
+  /**
+   * Minimum memory available across all deviced currently used
+   * @return size_t
+   */
+  static size_t min_avail_device_memory();
+#endif
+
+  static int thread_count() {
     return thread_count_;
   }
 
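min_avail_device_memory() reports the tightest free-memory figure across the devices in use, presumably so allocators can size workspaces conservatively in multi-GPU runs. A minimal usage sketch under that assumption (the helper and its threshold are illustrative):

    #ifndef CPU_ONLY
    #include "caffe/common.hpp"

    // Illustrative check: does a proposed workspace fit on every device
    // currently in use?
    bool fits_on_all_devices(size_t workspace_bytes) {
      return workspace_bytes <= caffe::Caffe::min_avail_device_memory();
    }
    #endif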

include/caffe/data_transformer.hpp (+66)

@@ -1,6 +1,12 @@
 #ifndef CAFFE_DATA_TRANSFORMER_HPP
 #define CAFFE_DATA_TRANSFORMER_HPP
 
+#ifdef USE_OPENCV
+
+#include <opencv2/core/core.hpp>
+
+#endif // USE_OPENCV
+
 #include <string>
 #include <vector>
 
@@ -49,6 +55,46 @@ class DataTransformer {
   void CopyPtrEntry(shared_ptr<Datum> datum, Dtype* transformed_ptr, size_t& out_sizeof_element,
       bool output_labels, Dtype* label);
 
+#ifdef USE_OPENCV
+  /**
+   * @brief Whether there are any "variable_sized" transformations defined
+   * in the data layer's transform_param block.
+   */
+  bool var_sized_transforms_enabled() const;
+
+  /**
+   * @brief Calculate the final shape from applying the "variable_sized"
+   * transformations defined in the data layer's transform_param block
+   * on the provided image, without actually performing any transformations.
+   *
+   * @param orig_shape
+   *    The shape of the data to be transformed.
+   */
+  vector<int> var_sized_transforms_shape(const vector<int>& orig_shape) const;
+
+  /**
+   * @brief Applies "variable_sized" transformations defined in the data layer's
+   * transform_param block to the data.
+   *
+   * @param old_datum
+   *    The source Datum containing data of arbitrary shape.
+   * @param new_datum
+   *    The destination Datum that will store transformed data of a fixed
+   *    shape. Suitable for other transformations.
+   */
+  shared_ptr<Datum> VariableSizedTransforms(shared_ptr<Datum> old_datum);
+
+  bool var_sized_image_random_resize_enabled() const;
+  vector<int> var_sized_image_random_resize_shape(const vector<int>& prev_shape) const;
+  cv::Mat& var_sized_image_random_resize(cv::Mat& img);
+  bool var_sized_image_random_crop_enabled() const;
+  vector<int> var_sized_image_random_crop_shape(const vector<int>& prev_shape) const;
+  cv::Mat& var_sized_image_random_crop(const cv::Mat& img);
+  bool var_sized_image_center_crop_enabled() const;
+  vector<int> var_sized_image_center_crop_shape(const vector<int>& prev_shape) const;
+  cv::Mat& var_sized_image_center_crop(const cv::Mat& img);
+#endif
+
   /**
    * @brief Applies the transformation defined in the data layer's
    * transform_param block to the data.
@@ -137,6 +183,20 @@ class DataTransformer {
       const std::array<unsigned int, 3>& rand);
 #endif // USE_OPENCV
 
+  vector<int> InferDatumShape(const Datum& datum);
+#ifdef USE_OPENCV
+  vector<int> InferCVMatShape(const cv::Mat& img);
+#endif // USE_OPENCV
+
+  /**
+   * @brief Infers the shape of transformed_blob will have when
+   * the transformation is applied to the data.
+   *
+   * @param bottom_shape
+   *    The shape of the data to be transformed.
+   */
+  vector<int> InferBlobShape(const vector<int>& bottom_shape, bool use_gpu = false);
+
   /**
    * @brief Infers the shape of transformed_blob will have when
    * the transformation is applied to the data.
@@ -180,6 +240,12 @@ class DataTransformer {
 #ifndef CPU_ONLY
   GPUMemory::Workspace mean_values_gpu_;
 #endif
+#ifdef USE_OPENCV
+  cv::Mat varsz_orig_img_;
+  cv::Mat varsz_rand_resize_img_;
+  cv::Mat varsz_rand_crop_img_;
+  cv::Mat varsz_center_crop_img_;
+#endif
 };
 
 }  // namespace caffe
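The new hooks let a data layer normalize arbitrarily sized images (random resize, random crop, center crop) to a fixed shape before the standard transform runs, with the cached cv::Mat members avoiding per-image reallocation. A hedged sketch of the intended call pattern; the driver function is an assumption, only VariableSizedTransforms and the enabled-check come from the header above:

    #ifdef USE_OPENCV
    #include "caffe/data_transformer.hpp"

    namespace caffe {

    // Illustrative driver: reduce a variable-sized datum to the fixed
    // shape the regular Transform() path expects.
    template<typename Dtype>
    shared_ptr<Datum> normalize_shape(DataTransformer<Dtype>& dt,
                                      shared_ptr<Datum> datum) {
      if (dt.var_sized_transforms_enabled())
        datum = dt.VariableSizedTransforms(datum);  // resize/crop to fixed shape
      return datum;
    }

    }  // namespace caffe
    #endif // USE_OPENCV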

include/caffe/layer.hpp (+3, -13)

@@ -129,8 +129,6 @@ class LayerBase {
    */
   virtual inline const char* type() const { return ""; }
 
-  virtual bool bias_term() const { return false; }  // FIXME
-
   /**
    * @brief Returns the layer name.
    */
@@ -141,6 +139,7 @@ class LayerBase {
   // Iteration counter maintained by Solver
   int iter() const;
   int relative_iter() const;
+  int iterations_sized() const;
 
   void set_solver_rank(size_t solver_rank) {
     solver_rank_ = solver_rank;
@@ -385,6 +384,8 @@ class LayerBase {
    */
   virtual void ToProto(LayerParameter* param, bool write_diff = false) = 0;
 
+  std::string print_current_device() const;
+
  protected:
   /** The vector that stores the learnable parameters as a set of blobs. */
   vector<shared_ptr<Blob>> blobs_;
@@ -610,17 +611,6 @@ Layer<Ftype, Btype>::Backward(const vector<Blob*>& top, const vector<bool>& prop
   }
 }
 
-// Serialize LayerParameter to protocol buffer
-template<typename Ftype, typename Btype>
-void Layer<Ftype, Btype>::ToProto(LayerParameter* param, bool write_diff) {
-  param->Clear();
-  param->CopyFrom(layer_param_);
-  param->clear_blobs();
-  for (int i = 0; i < blobs_.size(); ++i) {
-    blobs_[i]->ToProto<Btype>(param->add_blobs(), write_diff);
-  }
-}
-
 }  // namespace caffe
 
 #endif  // CAFFE_LAYER_H_
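The inline Layer::ToProto definition is removed from the header, presumably relocated to a source file so it can pass Blob's new store_in_old_format flag (see blob.hpp above). A hedged sketch of what the relocated body might look like; this is hypothetical, including the flag value:

    // Hypothetical relocated implementation (not quoted from this commit):
    template<typename Ftype, typename Btype>
    void Layer<Ftype, Btype>::ToProto(LayerParameter* param, bool write_diff) {
      param->Clear();
      param->CopyFrom(layer_param_);
      param->clear_blobs();
      for (int i = 0; i < blobs_.size(); ++i) {
        blobs_[i]->ToProto<Btype>(param->add_blobs(),
            /*store_in_old_format=*/false, write_diff);  // assumed default
      }
    }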

include/caffe/layers/base_conv_layer.hpp (-1)

@@ -27,7 +27,6 @@ class BaseConvolutionLayer : public Layer<Ftype, Btype> {
   virtual inline int MinBottomBlobs() const { return 1; }
   virtual inline int MinTopBlobs() const { return 1; }
   virtual inline bool EqualNumBottomTopBlobs() const { return true; }
-  bool bias_term() const override { return bias_term_; }
 
  protected:
   // Helper functions that abstract away the column buffer and gemm arguments.

include/caffe/layers/base_data_layer.hpp (+1)

@@ -119,6 +119,7 @@ class BasePrefetchingDataLayer : public BaseDataLayer<Ftype, Btype>, public Inte
   void InternalThreadEntry() override;
   void InternalThreadEntryN(size_t thread_id) override;
   void ResizeQueues();
+  void AllocatePrefetch();
 
   virtual void InitializePrefetch();
   virtual void load_batch(Batch<Ftype>* batch, int thread_id, size_t queue_id) = 0;

include/caffe/layers/cudnn_conv_layer.hpp (+10, -1)

@@ -58,7 +58,11 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Ftype, Btype> {
       : ConvolutionLayer<Ftype, Btype>(param), handles_setup_(false),
         use_algo_seeker_(true), use_modest_workspace_(true),
         forward_math_(tpmax<Ftype, float>()), backward_data_math_(tpmax<Btype, float>()),
-        backward_filter_math_(tpmax<Btype, float>()) {}
+        backward_filter_math_(tpmax<Btype, float>()) {
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    cudnn_math_override_ = -1;
+#endif
+  }
 
   virtual void LayerSetUp(const vector<Blob*>& bottom, const vector<Blob*>& top);
   virtual void Reshape(const vector<Blob*>& bottom, const vector<Blob*>& top);
@@ -77,6 +81,11 @@ class CuDNNConvolutionLayer : public ConvolutionLayer<Ftype, Btype> {
   vector<cudnnConvolutionBwdFilterAlgo_t> bwd_filter_algo_;
   vector<cudnnConvolutionBwdDataAlgo_t> bwd_data_algo_;
 
+#if CUDNN_VERSION_MIN(7, 0, 0)
+  int cudnn_math_override_;
+  vector<cudnnMathType_t> fwd_cudnn_math_, bwd_filter_cudnn_math_, bwd_data_cudnn_math_;
+#endif
+
   vector<cudnnTensorDescriptor_t> fwd_bottom_descs_, fwd_top_descs_;
   vector<cudnnTensorDescriptor_t> bwd_bottom_descs_, bwd_top_descs_;
   cudnnTensorDescriptor_t fwd_bias_desc_, bwd_bias_desc_;
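cudnn_math_override_ and the per-algorithm cudnnMathType_t vectors target cuDNN 7's Tensor Core ("tensor op") math on Volta; the -1 default presumably means "no override". A minimal sketch of the underlying cuDNN call, under that assumption:

    #include <cudnn.h>

    // Opt a convolution descriptor into Tensor Core math (cuDNN 7+).
    // Descriptor setup and error handling elided for brevity.
    void enable_tensor_ops(cudnnConvolutionDescriptor_t conv_desc) {
    #if CUDNN_MAJOR >= 7
      cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH);
    #endif
    }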

include/caffe/layers/embed_layer.hpp (-1)

@@ -29,7 +29,6 @@ class EmbedLayer : public Layer<Ftype, Btype> {
   virtual inline const char* type() const { return "Embed"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
-  bool bias_term() const override { return bias_term_; }
 
  protected:
   virtual void Forward_cpu(const vector<Blob*>& bottom,

include/caffe/layers/inner_product_layer.hpp (-1)

@@ -28,7 +28,6 @@ class InnerProductLayer : public Layer<Ftype, Btype> {
   virtual inline const char* type() const { return "InnerProduct"; }
   virtual inline int ExactNumBottomBlobs() const { return 1; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
-  bool bias_term() const override { return bias_term_; }
 
  protected:
   virtual void Forward_cpu(const vector<Blob*>& bottom,

include/caffe/layers/scale_layer.hpp (-1)

@@ -35,7 +35,6 @@ class ScaleLayer: public Layer<Ftype, Btype> {
   virtual inline int MinBottomBlobs() const { return 1; }
   virtual inline int MaxBottomBlobs() const { return 2; }
   virtual inline int ExactNumTopBlobs() const { return 1; }
-  bool bias_term() const override { return bias_term_; }
 
  protected:
   /**

include/caffe/layers/softmax_loss_layer.hpp (+4, -2)

@@ -52,7 +52,9 @@ class SoftmaxWithLossLayer : public LossLayer<Ftype, Btype> {
    * present; otherwise the loss is simply summed over spatial locations.
    */
   explicit SoftmaxWithLossLayer(const LayerParameter& param)
-      : LossLayer<Ftype, Btype>(param) {}
+      : LossLayer<Ftype, Btype>(param) {
+    prob_ = Blob::create<Ftype>();
+  }
   virtual void LayerSetUp(const vector<Blob*>& bottom,
       const vector<Blob*>& top);
   virtual void Reshape(const vector<Blob*>& bottom,
@@ -110,7 +112,7 @@ class SoftmaxWithLossLayer : public LossLayer<Ftype, Btype> {
   /// The internal SoftmaxLayer used to map predictions to a distribution.
   shared_ptr<Layer<Ftype, Btype> > softmax_layer_;
   /// prob stores the output probability predictions from the SoftmaxLayer.
-  TBlob<Ftype> prob_;  // Conversion if Ftype!=Btype
+  shared_ptr<Blob> prob_;  // Conversion if Ftype!=Btype
   /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
   vector<Blob*> softmax_bottom_vec_;
   /// top vector holder used in call to the underlying SoftmaxLayer::Forward
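prob_ changes from a concrete TBlob<Ftype> member to a polymorphic Blob built through the Blob::create<Dtype>() factory shown in the constructor, which lets the element type be chosen at construction time. A one-function sketch of the pattern (the wrapper is illustrative):

    #include "caffe/blob.hpp"

    namespace caffe {
    // Illustrative: the factory returns a type-erased Blob whose element
    // type (here float) is fixed when the blob is created.
    shared_ptr<Blob> make_prob_blob() {
      return Blob::create<float>();
    }
    }  // namespace caffe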

include/caffe/net.hpp (+9, -2)

@@ -200,6 +200,11 @@ class Net {
   const vector<string>& param_display_names() const {
     return param_display_names_;
   }
+
+  const pair<int, int>& param_layer_indices(int param_id) {
+    return param_layer_indices_[param_id];
+  }
+
   /// @brief Input and output blob numbers
   int num_inputs() const { return net_input_blobs_.size(); }
   int num_outputs() const { return net_output_blobs_.size(); }
@@ -252,8 +257,6 @@ class Net {
   void InitializeLearnableDiffSpace();
 #endif
 
-  size_t total_batch_size() const;
-
   void wait_layers_init() {
     for (Flag* flag : layer_inititialized_flags_) {
       flag->wait();
@@ -264,6 +267,9 @@ class Net {
     return global_grad_scale_;
   }
 
+  size_t infer_count() const {
+    return infer_count_;
+  }
 
  protected:
   // Helpers for Init.
@@ -378,6 +384,7 @@ class Net {
   vector<Flag*> layer_inititialized_flags_;
   NetParameter net_param_;
 
+  size_t infer_count_;
   float global_grad_scale_;
 
   static constexpr int END_OF_ITERATION = -1;
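param_layer_indices() exposes, per learnable parameter, the pair stored in param_layer_indices_; in Caffe that conventionally means (owning layer index, parameter index within that layer), and infer_count_ appears to count inference passes. A hedged usage sketch; the helper and the pair semantics are assumptions:

    #include "caffe/net.hpp"

    namespace caffe {
    // Illustrative inspection helper; pair meaning follows the usual
    // Caffe convention (layer id, local param index within that layer).
    void describe_param(Net& net, int param_id) {
      const std::pair<int, int>& idx = net.param_layer_indices(param_id);
      LOG(INFO) << "param " << param_id << " belongs to layer " << idx.first
                << " (local index " << idx.second << "); net has run "
                << net.infer_count() << " inference passes";
    }
    }  // namespace caffe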
