Commit fd77927: Merge pull request #359 from drnikolaev/caffe-0.16.2-pr
June 2017 release
2 parents: 5a06f0e + 2a4581e

110 files changed: +4422 -887 lines

3rdparty/half_float/half.hpp (+22 -23)

@@ -197,7 +197,16 @@
 #endif

 #ifdef __CUDA_ARCH__
-#include "caffe/util/gpu_math_functions.cuh"
+#include "caffe/util/half.cuh"
+#include "caffe/util/gpu_math_functions.cuh"
+#endif
+
+#if !defined(CPU_ONLY) && defined(__CUDA_ARCH__)
+#define CAFFE_UTIL_HD __host__ __device__
+#define CAFFE_UTIL_IHD __inline__ __host__ __device__
+#else
+#define CAFFE_UTIL_HD
+#define CAFFE_UTIL_IHD inline
 #endif

 /// Default rounding mode.

@@ -956,29 +965,24 @@ namespace half_float
   friend struct std::hash<half>;
 #endif

-  public:
+ public:
   /// Default constructor.
   /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics
   /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics.
   HALF_CONSTEXPR
   CAFFE_UTIL_HD
   half() : data_() {}

+  template<typename H>
   CAFFE_UTIL_HD
-  __half geth() const {
-    __half h;
-    h.x = data_;
-    return h;
+  const H* gethp() const {
+    return reinterpret_cast<const H*>(&data_);
   }

+  template<typename H>
   CAFFE_UTIL_HD
-  const __half* gethp() const {
-    return reinterpret_cast<const __half*>(this);
-  }
-
-  CAFFE_UTIL_HD
-  __half* gethp() {
-    return reinterpret_cast<__half*>(this);
+  H* gethp() {
+    return reinterpret_cast<H*>(&data_);
   }

   CAFFE_UTIL_HD

@@ -995,18 +999,13 @@ namespace half_float
   /// Copy constructor.
   /// \tparam T type of concrete half expression
   /// \param rhs half expression to copy from
-  // half(detail::expr rhs) : data_(detail::float2half<round_style>(rhs)) {}
-
   CAFFE_UTIL_HD
   half(detail::expr rhs) {
     assign(rhs);
   }

   /// Conversion constructor.
   /// \param rhs float to convert
-  // template<typename T>
-  // half(const T& rhs) : data_(detail::float2half<round_style>((float)rhs)) {}
-
   template<typename T>
   CAFFE_UTIL_HD
   half(const T& rhs) {

@@ -1030,8 +1029,8 @@ namespace half_float
   // operator float() const { return detail::half2float(data_); }
   CAFFE_UTIL_HD operator float() const {
 #ifdef __CUDA_ARCH__
-    __half h;
-    h.x = data_;
+    ::half h;
+    h.setx(data_);
     return __half2float(h);
 #else
     return detail::half2float(data_);

@@ -1040,7 +1039,7 @@ namespace half_float

   CAFFE_UTIL_HD void assign(float rhs) {
 #ifdef __CUDA_ARCH__
-    data_ = float2half_clip(rhs).x;
+    data_ = float2half_clip(rhs).x();
 #else
     data_ = detail::float2half<round_style>(rhs);
 #endif

@@ -1117,9 +1116,9 @@ namespace half_float
     float after = static_cast<float>(*this);
     if (before == after && before != 0.f && rhs != 0.f) {
 #ifdef __CUDA_ARCH__
-      CUPRINTF("GPU PRECISION LOSS: %g -= %g\n", before, rhs);
+      printf("GPU PRECISION LOSS: %g -= %g\n", before, rhs);
 #else
-      CUPRINTF("CPU PRECISION LOSS: %g -= %g\n", before, rhs);
+      printf("CPU PRECISION LOSS: %g -= %g\n", before, rhs);
 #endif
     }
 #else
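
For orientation, a minimal device-side sketch (not part of this commit) of how the new templated gethp<H>() accessor might be consumed. The function name to_float and the include path are assumptions; gethp<H>() comes from the hunk above and __half2float from the CUDA runtime:

    // Sketch only: reinterpret half_float::half's 16-bit storage as CUDA's __half.
    #include <cuda_fp16.h>
    #include "half.hpp"   // 3rdparty/half_float; actual include path may differ

    __device__ float to_float(const half_float::half& h) {
      // gethp<__half>() returns the internal bits reinterpreted as const __half*.
      return __half2float(*h.gethp<__half>());
    }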

CMakeLists.txt (+3)

@@ -14,6 +14,9 @@ set(CAFFE_TARGET_VERSION "0.16.1")
 set(CAFFE_TARGET_SOVERSION "0.16")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})

+# Skip `typedef __half half;`
+add_definitions(-DCUDA_NO_HALF=1)
+
 # ---[ Using cmake scripts and modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

Makefile (+2 -1)

@@ -42,8 +42,9 @@ DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_SONAME_SHORT).$(DYNAMIC_VERSION_REVISION)
 DYNAMIC_NAME := $(LIB_BUILD_DIR)/$(DYNAMIC_VERSIONED_NAME_SHORT)
 COMMON_FLAGS += -DCAFFE_VERSION=$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
-# FP16 Caffe requires C++ 11
+# NVCaffe requires C++ 11
 COMMON_FLAGS += -std=c++11
+COMMON_FLAGS += -DCUDA_NO_HALF

 ##############################
 # Get all source files
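
Both the CMake and Make builds now define CUDA_NO_HALF. As a rough illustration (an assumption based on the "Skip `typedef __half half;`" comment above, not code from this commit), the define keeps CUDA's global alias out of the way:

    // Sketch only: what -DCUDA_NO_HALF suppresses. cuda_fp16.h contains roughly
    //   #if !defined(CUDA_NO_HALF)
    //   typedef __half half;   // global-namespace alias
    //   #endif
    // With the alias gone, the unqualified name `half` is free for NVCaffe's own
    // ::half wrapper (caffe/util/half.cuh) and half_float::half stays unambiguous.
    #include <cuda_fp16.h>
    static_assert(sizeof(__half) == 2, "FP16 storage is 16 bits");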

cmake/Cuda.cmake (+3 -1)

@@ -56,7 +56,7 @@ endfunction()
 # caffe_select_nvcc_arch_flags(out_variable)
 function(caffe_select_nvcc_arch_flags out_variable)
   # List of arch names
-  set(__archs_names "Fermi" "Kepler" "Maxwell" "All" "Manual")
+  set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
   set(__archs_name_default "All")
   if(NOT CMAKE_CROSSCOMPILING)
     list(APPEND __archs_names "Auto")

@@ -91,6 +91,8 @@ function(caffe_select_nvcc_arch_flags out_variable)
     set(__cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
     set(__cuda_arch_bin "60 61 62")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(__cuda_arch_bin "70")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(__cuda_arch_bin ${Caffe_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")

cmake/Modules/FindNVML.cmake (+2 -6)

@@ -9,12 +9,8 @@
 #  NVML_LIBRARY

 file (GLOB MLPATH /usr/lib/nvidia-???)
-
-find_path(NVML_INCLUDE_DIR NAMES nvml.h
-  PATHS ${CUDA_INCLUDE_DIRS} ${NVML_ROOT_DIR}/include
-  )
-
-find_library(NVML_LIBRARY nvidia-ml PATHS ${MLPATH} ${NVML_ROOT_DIR}/lib ${NVML_ROOT_DIR}/lib64)
+find_path(NVML_INCLUDE_DIR NAMES nvml.h PATHS ${CUDA_INCLUDE_DIRS} ${NVML_ROOT_DIR}/include)
+find_library(NVML_LIBRARY nvidia-ml PATHS ${MLPATH} /usr/local/cuda/lib64/stubs ${NVML_ROOT_DIR}/lib ${NVML_ROOT_DIR}/lib64)

 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(NVML DEFAULT_MSG NVML_INCLUDE_DIR NVML_LIBRARY)

cmake/lint.cmake (+1 -1)

@@ -1,7 +1,7 @@

 set(CMAKE_SOURCE_DIR ..)
 set(LINT_COMMAND ${CMAKE_SOURCE_DIR}/scripts/cpp_lint.py)
-set(SRC_FILE_EXTENSIONS h hpp hu c cpp cu cc)
+set(SRC_FILE_EXTENSIONS h hpp hu c cpp cu cc cuh)
 set(EXCLUDE_FILE_EXTENSTIONS pb.h pb.cc)
 set(LINT_DIRS include src/caffe examples tools python matlab)

common_plot.py (+44)

@@ -9,8 +9,14 @@
 def get_test_accuracy(log, top_k):
     iteration = re.findall(r'Iteration (\d*), Testing net \(#0\)', log)
     accuracy = re.findall(r'Test net output #\d: accuracy/top-{top_k} = (\d*.\d*)'.format(top_k=top_k), log)
+    if len(accuracy)==0:
+        accuracy = re.findall(r'Test net output #\d: top-{top_k} = (\d*.\d*)'.format(top_k=top_k), log)
     if len(accuracy)==0:
         accuracy = re.findall(r'Test net output #\d: loss/top-{top_k} = (\d*.\d*)'.format(top_k=top_k), log)
+    if len(accuracy)==0:
+        accuracy = re.findall(r'Test net output #\d: accuracy/top{top_k} = (\d*.\d*)'.format(top_k=top_k), log)
+    if len(accuracy)==0:
+        accuracy = re.findall(r'Test net output #\d: accuracy = (\d*.\d*)', log)
     iteration = [int(i) for i in iteration]
     accuracy = [float(i) for i in accuracy]
     return iteration, accuracy

@@ -25,6 +31,13 @@ def get_test_loss(log):
     loss = [float(i) for i in loss]
     return iteration, loss

+def get_train_loss(log):
+    iteration = re.findall(r'Iteration (\d*), lr = ', log)
+    loss = re.findall(r'Train net output #\d: loss = (\d*.\d*)', log)
+    iteration = [int(i) for i in iteration]
+    loss = [float(i) for i in loss]
+    return iteration, loss
+

 def get_net_name(log):
     return re.findall(r"Solving (.*)\n", log)[0]

@@ -44,13 +57,22 @@ def parse_files(files, top_k=1, separate=False):
         data[net_name]["loss"] = {}
         data[net_name]["loss"]["loss"] = []
         data[net_name]["loss"]["iteration"] = []
+        data[net_name]["train_loss"] = {}
+        data[net_name]["train_loss"]["loss"] = []
+        data[net_name]["train_loss"]["iteration"] = []
+
         iteration, accuracy = get_test_accuracy(log, top_k)
         data[net_name]["accuracy"]["iteration"].extend(iteration)
         data[net_name]["accuracy"]["accuracy"].extend(accuracy)

         iteration, loss = get_test_loss(log)
         data[net_name]["loss"]["iteration"].extend(iteration)
         data[net_name]["loss"]["loss"].extend(loss)
+
+        iteration, loss = get_train_loss(log)
+        data[net_name]["train_loss"]["iteration"].extend(iteration)
+        data[net_name]["train_loss"]["loss"].extend(loss)
+
     return data

@@ -172,3 +194,25 @@ def plot_loss(data, value_at_hover=False):
     plt.xlim(0)
     plt.grid()
     return plt
+
+def plot_train_loss(data, value_at_hover=False):
+    nets = data.keys()
+    colors = iter(cm.rainbow(np.linspace(0, 1, len(nets))))
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    for net in nets:
+        iteration = data[net]["train_loss"]["iteration"]
+        loss = data[net]["train_loss"]["loss"]
+        iteration, loss = (list(t) for t in zip(*sorted(zip(iteration, loss))))
+        ax.scatter(iteration, loss, color=next(colors))
+        if value_at_hover:
+            cursor = FollowDotCursor(ax, iteration, loss)
+
+    plt.legend(nets, loc='upper right')
+    plt.title("Log Loss")
+    plt.xlabel("Iteration")
+    plt.ylabel("Log Loss")
+    plt.xlim(0)
+    plt.grid()
+    return plt

include/caffe/blob.hpp (+16 -71)

@@ -36,49 +36,6 @@ class TBlob;
  */
 class Blob {
  public:
-  // This proxy makes sure that we can't rely on cached values while pointer
-  // to data is being used and data potentially might be changed.
-  // When pointer is actually given, proxy flushes the cache.
-  // There are use cases where we "preliminary convert data" but don't change it yet.
-  // In such cases cache is still valid until we really change data.
-  // For example, this line doesn't change blob's state:
-  //   Blob::PtrProxy<Ftype> top_data = top[i]->mutable_gpu_data<Ftype>();
-  // The state will be changed at the moment of passing a raw pointer to,
-  // let say, CuDNN routine.
-  template<typename Ptype>
-  class PtrProxy {
-   public:
-    PtrProxy() : tensor_(), is_gpu_(false), zero_new_mem_(true) {}
-
-    PtrProxy(shared_ptr<Tensor> tensor, bool is_gpu, bool zero_new_mem = true)
-        : tensor_(tensor), is_gpu_(is_gpu), zero_new_mem_(zero_new_mem) {}
-
-    operator Ptype*() {
-      CHECK(tensor_);
-      return reinterpret_cast<Ptype*>(tensor_->mutable_memory(tp<Ptype>(), is_gpu_, zero_new_mem_));
-    }
-
-    ~PtrProxy() {}
-
-    PtrProxy(PtrProxy&& other) : tensor_(std::move(other.tensor_)), is_gpu_(other.is_gpu_),
-        zero_new_mem_(other.zero_new_mem_) {}
-
-    PtrProxy& operator=(PtrProxy&& other) {
-      tensor_ = std::move(other.tensor_);
-      is_gpu_ = other.is_gpu_;
-      zero_new_mem_ = other.zero_new_mem_;
-      return *this;
-    }
-
-    PtrProxy(const PtrProxy&) = delete;
-    PtrProxy& operator=(const PtrProxy& other) = delete;
-
-   private:
-    shared_ptr<Tensor> tensor_;
-    bool is_gpu_;
-    bool zero_new_mem_;
-  };
-
   void Swap(Blob& other) noexcept {
     std::swap(data_tensor_, other.data_tensor_);
     std::swap(diff_tensor_, other.diff_tensor_);

@@ -387,26 +344,15 @@
   }

   template<typename Dtype>
-  PtrProxy<Dtype> mutable_cpu_data(bool zero_new_mem = true) {
+  Dtype* mutable_cpu_data() {
     convert_data(tp<Dtype>());
-    return PtrProxy<Dtype>(data_tensor_, false, zero_new_mem);
+    return static_cast<Dtype*>(data_tensor_->mutable_synced_mem()->mutable_cpu_data());
   }

   template<typename Dtype>
-  PtrProxy<Dtype> mutable_cpu_diff(bool zero_new_mem = true) {
+  Dtype* mutable_cpu_diff() {
     convert_diff(tp<Dtype>());
-    return PtrProxy<Dtype>(diff_tensor_, false, zero_new_mem);
-  }
-
-  // pycaffe needs these two, do NOT use them anywhere else
-  template<typename Dtype>
-  Dtype* mutable_cpu_data_raw() {
-    return (Dtype*) Blob::mutable_cpu_data<Dtype>();
-  }
-
-  template<typename Dtype>
-  Dtype* mutable_cpu_diff_raw() {
-    return (Dtype*) Blob::mutable_cpu_diff<Dtype>();
+    return static_cast<Dtype*>(diff_tensor_->mutable_synced_mem()->mutable_cpu_data());
   }

   // Element-wise accessor. Might be slow due to syncing from GPU to CPU.

@@ -572,15 +518,15 @@
   }

   template<typename Dtype>
-  PtrProxy<Dtype> mutable_gpu_data(bool zero_new_mem = true) {
+  Dtype* mutable_gpu_data() {
     convert_data(tp<Dtype>());
-    return PtrProxy<Dtype>(data_tensor_, true, zero_new_mem);
+    return static_cast<Dtype*>(data_tensor_->mutable_synced_mem()->mutable_gpu_data());
   }

   template<typename Dtype>
-  PtrProxy<Dtype> mutable_gpu_diff(bool zero_new_mem = true) {
+  Dtype* mutable_gpu_diff() {
     convert_diff(tp<Dtype>());
-    return PtrProxy<Dtype>(diff_tensor_, true, zero_new_mem);
+    return static_cast<Dtype*>(diff_tensor_->mutable_synced_mem()->mutable_gpu_data());
   }

   void async_gpu_push() {

@@ -701,19 +647,18 @@ class TBlob : public Blob {
   }

   template<typename T = Dtype>
-  PtrProxy <T> mutable_cpu_data(bool zero_new_mem = true) {
+  T* mutable_cpu_data() {
     check_integrity(true, data_type(), tp<T>());
-    return Blob::mutable_cpu_data<T>(zero_new_mem);
+    return Blob::mutable_cpu_data<T>();
   }

   template<typename T = Dtype>
-  PtrProxy <T> mutable_cpu_diff(bool zero_new_mem = true) {
+  T* mutable_cpu_diff() {
     check_integrity(false, diff_type(), tp<T>());
-    return Blob::mutable_cpu_diff<T>(zero_new_mem);
+    return Blob::mutable_cpu_diff<T>();
   }

 #ifndef CPU_ONLY
-
   template<typename T = Dtype>
   const T* gpu_data() const {
     check_integrity(true, data_type(), tp<T>());

@@ -727,15 +672,15 @@ class TBlob : public Blob {
   }

   template<typename T = Dtype>
-  PtrProxy <T> mutable_gpu_data(bool zero_new_mem = true) {
+  T* mutable_gpu_data() {
     check_integrity(true, data_type(), tp<T>());
-    return Blob::mutable_gpu_data<T>(zero_new_mem);
+    return Blob::mutable_gpu_data<T>();
   }

   template<typename T = Dtype>
-  PtrProxy <T> mutable_gpu_diff(bool zero_new_mem = true) {
+  T* mutable_gpu_diff() {
     check_integrity(false, diff_type(), tp<T>());
-    return Blob::mutable_gpu_diff<T>(zero_new_mem);
+    return Blob::mutable_gpu_diff<T>();
   }
 #endif
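
As a call-site illustration (not taken from this commit), the removal of PtrProxy means the example in the deleted comment now reduces to a plain pointer assignment; fill_top and Ftype below are placeholders, while mutable_cpu_data<>() and count() are existing Blob accessors:

    // Sketch only: the accessor now returns Ftype* directly instead of a
    // Blob::PtrProxy<Ftype>; type conversion still happens in convert_data().
    template <typename Ftype>
    void fill_top(caffe::Blob* top, Ftype value) {
      Ftype* top_data = top->mutable_cpu_data<Ftype>();
      for (int i = 0; i < top->count(); ++i) {
        top_data[i] = value;
      }
    }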
