
Commit 7dec14a

lw authored and facebook-github-bot committed
Avoid defining RpcCUDAFuture subclass in TensorPipe agent (pytorch#56513)
Summary: Pull Request resolved: pytorch#56513

The RpcCUDAFuture class existed solely to support extracting DataPtrs from a Message class. However, this can be done more simply by using a vanilla CUDAFuture and extracting those DataPtrs before marking it complete, then passing them to markCompleted. This makes it possible to make the DataPtr extraction logic of CUDAFuture private again.

ghstack-source-id: 127035771

Test Plan: Unit tests

Reviewed By: mrshenli

Differential Revision: D27861064

fbshipit-source-id: b0b4df2cab7be6b4b16d5cfc888483c18fbce60e
1 parent 5ddc269 commit 7dec14a

3 files changed: 11 additions, 27 deletions

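As a rough sketch of the pattern this commit adopts (not part of the diff itself; the collectDataPtrs helper and its free-function form are made up here for illustration, while the Message, CUDAFuture, and markCompleted calls mirror the hunks below), the agent now gathers the DataPtrs backing a Message's tensors itself and hands them to a plain CUDAFuture, instead of relying on an RpcCUDAFuture subclass that overrode extractDataPtrs:

// Sketch only: collect the DataPtrs of a Message's tensors, as the agent
// now does inline in markFutureAsComplete (see the tensorpipe_agent.cpp
// hunk below). The helper name is hypothetical, not from the PR.
std::vector<std::reference_wrapper<const at::DataPtr>> collectDataPtrs(
    const torch::distributed::rpc::Message& message) {
  std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
  data_ptrs.reserve(message.tensors().size());
  for (const auto& tensor : message.tensors()) {
    data_ptrs.emplace_back(tensor.storage().data_ptr());
  }
  return data_ptrs;
}

// Usage sketch: a vanilla CUDAFuture completed with explicitly supplied
// DataPtrs, replacing the removed RpcCUDAFuture subclass.
auto future = std::make_shared<at::cuda::CUDAFuture>(at::AnyClassType::get());
auto data_ptrs = collectDataPtrs(message);
future->markCompleted(
    c10::IValue(c10::make_intrusive<torch::distributed::rpc::Message>(
        std::move(message))),
    std::move(data_ptrs));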

aten/src/ATen/cuda/CUDAFuture.h

Lines changed: 3 additions & 3 deletions
@@ -22,7 +22,7 @@
 namespace at {
 namespace cuda {
 
-struct TORCH_CUDA_CPP_API CUDAFuture : at::ivalue::Future {
+struct TORCH_CUDA_CPP_API CUDAFuture final : at::ivalue::Future {
  public:
   CUDAFuture(at::TypePtr type) : at::ivalue::Future(std::move(type)) {
     // Use current device to initialize currentDevice_. This is necessary
@@ -129,7 +129,8 @@ struct TORCH_CUDA_CPP_API CUDAFuture : at::ivalue::Future {
     }
   }
 
-  virtual std::vector<std::reference_wrapper<const at::DataPtr>> extractDataPtrs(
+ private:
+  std::vector<std::reference_wrapper<const at::DataPtr>> extractDataPtrs(
       const at::IValue& value) {
     at::IValue::HashAliasedIValues sub_values;
     // Prefer getSubValues() over visit() as the latter is a silent no-op for
@@ -145,7 +146,6 @@ struct TORCH_CUDA_CPP_API CUDAFuture : at::ivalue::Future {
     return data_ptrs;
   }
 
- private:
   // The device that was current when markCompleted was called, which we'll
   // restore when invoking callbacks.
   c10::DeviceIndex currentDevice_;
torch/csrc/distributed/rpc/tensorpipe_agent.cpp

Lines changed: 6 additions & 1 deletion
@@ -1321,8 +1321,13 @@ void TensorPipeAgent::markFutureAsComplete(
                    message{std::move(message)},
                    ctx{std::move(ctx)}]() mutable {
     MultiStreamGuard guard(ctx);
+    std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
+    for (const auto& tensor : message.tensors()) {
+      data_ptrs.emplace_back(tensor.storage().data_ptr());
+    }
     atomicFuture->jitFuture->markCompleted(
-        IValue(c10::make_intrusive<Message>(std::move(message))));
+        IValue(c10::make_intrusive<Message>(std::move(message))),
+        std::move(data_ptrs));
     // The future's callbacks may schedule further RPCs, increasing the count.
     // Thus we must decrease it after completing the future, otherwise it may
     // briefly dip to zero and trick join into thinking all work is done.

torch/csrc/distributed/rpc/tensorpipe_agent.h

Lines changed: 2 additions & 23 deletions
@@ -276,28 +276,6 @@ class TensorPipeAgent : public RpcAgent {
       const std::string& remoteName,
       const Message& message) const;
 
-#ifdef USE_CUDA_NOT_ROCM
-  // An RPC-specific CUDAFuture subclass. It overrides the extractDataPtrs
-  // function to handle and only handle RPC Messages.
-  struct TORCH_CUDA_CPP_API RpcCUDAFuture final : at::cuda::CUDAFuture {
-   public:
-    using at::cuda::CUDAFuture::CUDAFuture;
-
-   protected:
-    std::vector<std::reference_wrapper<const at::DataPtr>> extractDataPtrs(
-        const at::IValue& value) override {
-      const auto message = value.toCustomClass<Message>();
-      TORCH_INTERNAL_ASSERT(
-          message, "Passed a non-Message type to RpcCUDAFuture");
-      std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
-      for (const auto& tensor : message->tensors()) {
-        data_ptrs.emplace_back(tensor.storage().data_ptr());
-      }
-      return data_ptrs;
-    }
-  };
-#endif
-
   // When a request+response completes, we need to mark the future message as
   // complete. However, if its timeout has already expired, it already has an
   // error set. There is no atomic "test-and-set" way to mark a future complete
@@ -308,7 +286,8 @@ class TensorPipeAgent : public RpcAgent {
     AtomicJitFuture(bool noCuda = true) {
 #ifdef USE_CUDA_NOT_ROCM
       if (!noCuda) {
-        jitFuture = std::make_shared<RpcCUDAFuture>(at::AnyClassType::get());
+        jitFuture =
+            std::make_shared<at::cuda::CUDAFuture>(at::AnyClassType::get());
       } else {
 #else
     {
