
Commit 145bc5c

ezyang authored and facebook-github-bot committed
Rename Math to CompositeImplicitAutograd (pytorch#54466)
Summary:
Pull Request resolved: pytorch#54466

I had to very carefully audit all the use sites, since there are a lot of other uses of the string Math; I did most of the conversion by grepping for all occurrences of Math and then doing a search and replace. I also updated documentation for clarity.

Signed-off-by: Edward Z. Yang <[email protected]>

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D27253239

Pulled By: ezyang

fbshipit-source-id: afb485d07ff39575742a4f0e1e205179b60bc953
1 parent 87989a6 commit 145bc5c

File tree

17 files changed (+213 lines, -166 lines)


BUILD.bazel

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ genrule(
         "aten/src/ATen/RegisterMkldnnCPU.cpp",
         "aten/src/ATen/RegisterQuantizedCPU.cpp",
         "aten/src/ATen/RegisterSparseCPU.cpp",
-        "aten/src/ATen/RegisterMath.cpp",
+        "aten/src/ATen/RegisterCompositeImplicitAutograd.cpp",
         "aten/src/ATen/RegisterMeta.cpp",
         "aten/src/ATen/RegisterDefaultBackend.cpp",
         "aten/src/ATen/RegisterSchema.cpp",

aten/src/ATen/core/boxing/KernelFunction.cpp

Lines changed: 4 additions & 3 deletions
@@ -21,9 +21,10 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet,

 void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) {
   TORCH_INTERNAL_ASSERT(0,
-    op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. "
-    "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). "
-    "If it's intended to override Math kernel behavior, please open an issue to request a dedicated "
+    op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. "
+    "This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering "
+    "(see Note [Ambiguity in AutogradOther kernel]). "
+    "If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated "
     "Autograd dispatch key for the backend.\n",
     "If you only want to run inference instead of training, add `at::AutoNonVariableTypeMode guard(true);` "
     "before model.forward(). Note this guard is only available in C++ but not Python at present.",

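The rewritten message keeps the existing inference-only escape hatch: wrap the forward call in at::AutoNonVariableTypeMode. Below is a minimal C++ sketch of that usage; only the guard line comes from the error text itself, while the TorchScript loading code and the model file name are illustrative assumptions.

#include <torch/script.h>

int main() {
  // Hypothetical TorchScript module, used only to illustrate the guard.
  torch::jit::Module model = torch::jit::load("model.pt");

  // Inference-only: skip the Autograd/AutogradOther keys, as the error text
  // above suggests (C++ only, not Python, at the time of this commit).
  at::AutoNonVariableTypeMode guard(true);

  std::vector<torch::jit::IValue> inputs;
  inputs.push_back(torch::ones({1, 3}));
  at::Tensor out = model.forward(inputs).toTensor();
  return 0;
}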
aten/src/ATen/core/boxing/KernelFunction.h

Lines changed: 37 additions & 9 deletions
@@ -18,15 +18,43 @@ struct OperatorKernel;
 TORCH_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);

 // Note [Ambiguity in AutogradOther kernel]
-// This kernel implements reporting an error message when there're kernels registered
-// to both Math and a backend of AutogradOther, we don't know which kernel to pick:
-// - if we pick Math kernel for AutogradOther, the kernel registered to backend will be
-//   silently ignored and never called.
-// - if we skip using Math kernel for AutogradOther (it might pick Autograd kernel if available),
-//   it'll break all backends mapped to AutogradOther without a direct registration to backend.
-//   See c10/core/DispatchKeySet.cpp for a list of backends mapped to AutogradOther.
-// Thus if backend extender indeed want to override Math kernel behavior, they should request
-// a dedicated Autograd key for their backend to resolve the ambiguity.
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This error-reporting kernel is registered to the AutogradOther entry in the
+// dispatch table when there is both a CompositeImplicitAutograd kernel and a
+// backend kernel for ANY backend that maps to AutogradOther.  To see why
+// this is necessary in the AutogradOther case, it's helpful to first see
+// why everything works out fine for a backend that has a reserved Autograd
+// entry (see rule 2.2 in [Note] DispatchTable computation):
+//
+//    CPU     AutogradCPU
+//    reg?    registers with...
+//    -------------------------------------------------
+//    y       Autograd registration takes precedence
+//            over CompositeImplicitAutograd.
+//            This is good, because the CPU specific backend
+//            implementation is more specialized and typically better;
+//            if we used the composite, we would bypass it.
+//            (NB: the Autograd key is guaranteed to exist because
+//            the autograd codegen requires it!)
+//
+//    n       CompositeImplicitAutograd takes precedence.
+//            This is also good, because the Autograd
+//            registration (if it exists) would try to redispatch
+//            to the (non-existent) CPU implementation; by
+//            using the composite, we ensure the operator
+//            actually works.
+//
+// As you can see, when we have a specific Autograd key (AutogradCPU), we can
+// decide whether or not to use the CompositeImplicitAutograd kernel or the
+// Autograd kernel based on whether or not the backend kernel exists.
+//
+// However, for AutogradOther (which is the catchall autograd kernel for
+// everything that doesn't have a specific Autograd key), we can't do this
+// trick because there isn't any unique backend to peek at to disambiguate;
+// if there are some backends that have implementations they prefer Autograd,
+// but unimplemented backends would prefer CompositeImplicitAutograd.  Rather
+// than arbitrarily pick one or the other, we just register a kernel that raises
+// an error and let the user decide how to proceed.
 TORCH_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);

 // Note [named_not_supported_kernel]
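To make the note concrete, here is a small sketch of the ambiguous pattern it describes, modeled on the throwsWhenRegisterToBackendMapsToAutogradOther test later in this commit; the namespace and operator name are invented for illustration.

#include <ATen/ATen.h>
#include <torch/library.h>

TORCH_LIBRARY(myns, m) {
  // Composite kernel: written in terms of other ops, so autograd "just works".
  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd,
                              [](const at::Tensor& x) { return x * 2; }));
  // SparseCPU maps to AutogradOther. With both registrations present, the
  // dispatcher installs ambiguous_autogradother_kernel in the AutogradOther
  // slot, and calling the op on a sparse tensor that requires grad raises the
  // error defined in KernelFunction.cpp above.
  m.impl("fn", c10::DispatchKey::SparseCPU,
         [](const at::Tensor& x) { return x; });
}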

aten/src/ATen/core/dispatch/OperatorEntry.cpp

Lines changed: 19 additions & 19 deletions
@@ -108,8 +108,8 @@ std::list<AnnotatedKernel>::iterator OperatorEntry::registerKernel(

   // Add the kernel to the kernels list,
   // possibly creating the list if this is the first kernel.
-  // Redirect catchAll registrations to Math.
-  auto& k = dispatch_key.has_value() ? kernels_[*dispatch_key] : kernels_[DispatchKey::Math];
+  // Redirect catchAll registrations to CompositeImplicitAutograd.
+  auto& k = dispatch_key.has_value() ? kernels_[*dispatch_key] : kernels_[DispatchKey::CompositeImplicitAutograd];

   if (k.size() > 0) {
     TORCH_WARN("Overriding a previously registered kernel for the same operator and the same dispatch key\n",
@@ -138,8 +138,8 @@ void OperatorEntry::deregisterKernel_(
   c10::optional<DispatchKey> dispatch_key,
   std::list<AnnotatedKernel>::iterator kernel
 ) {
-  // Redirect catchAll deregistrations to Math.
-  DispatchKey dk = dispatch_key.has_value() ? *dispatch_key : DispatchKey::Math;
+  // Redirect catchAll deregistrations to CompositeImplicitAutograd.
+  DispatchKey dk = dispatch_key.has_value() ? *dispatch_key : DispatchKey::CompositeImplicitAutograd;
   auto found = kernels_.find(dk);
   TORCH_INTERNAL_ASSERT(found != kernels_.end(), "Tried to deregister a kernel for dispatch key ", toString(dispatch_key), " but there are no kernels registered for this dispatch key. The operator is ", toString(name_));
   auto& k = found->second;
@@ -186,13 +186,13 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   // (2.1) Use kernel from DispatchKey::DefaultBackend if available.
   //       This is used to register a kernel that works for all backend in inference. But it requires
   //       separate registration for Autograd keys to support training.
-  // (2.2) Use kernel from DispatchKey::Math if available.
-  //       For autograd keys, we only use kernel from Math when there's no direct registration
-  //       to its corresponding backend key or DefaultBackend. See Note [DefaultBackend and Math].
+  // (2.2) Use kernel from DispatchKey::CompositeImplicitAutograd if available.
+  //       For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration
+  //       to its corresponding backend key or DefaultBackend. See Note [DefaultBackend and CompositeImplicitAutograd].
   //       For AutogradOther, we eagerly return ambiguousAutogradOtherKernel_ if there's registration to any of
   //       its backends and ask backend extender to request a decicated Autograd key for the backend.
   //       See Note [Ambiguity in AutogradOther kernel] for more details.
-  //       A DefaultBackend kernel prevents Math kernel being used for Autograd keys, but it doesn't
+  //       A DefaultBackend kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't
   //       cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available)
   //       in this case.
   // (2.3) Use kernel from DispatchKey::Autograd if available
@@ -201,11 +201,11 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   //       backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_]
   // (3) Use fallthrough kernel that are registered as fallback.
   // Alias Key Precedence:
-  //   DefaultBackend > Math > Autograd
-  // Note [DefaultBackend and Math]
-  // When there're registrations to both DefaultBackend & Math & Autograd, from (2.2) we know DefaultBackend
-  // and Autograd kernels will be picked up and Math is overriden.
-  // This is fine and in practice DefaultBackend and Math shouldn't co-exist for an op.
+  //   DefaultBackend > CompositeImplicitAutograd > Autograd
+  // Note [DefaultBackend and CompositeImplicitAutograd]
+  // When there're registrations to both DefaultBackend & CompositeImplicitAutograd & Autograd, from (2.2) we know DefaultBackend
+  // and Autograd kernels will be picked up and CompositeImplicitAutograd is overriden.
+  // This is fine and in practice DefaultBackend and CompositeImplicitAutograd shouldn't co-exist for an op.
   // TODO: Update alias key precedence after we add new alias keys AutogradDispatchCPUOrCUDA .

   // 1. Operator registration
@@ -226,13 +226,13 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
   bool has_backend_kernel =
     hasKernelForAnyDispatchKey(getBackendKeySetFromAutograd(dispatch_key).add(DispatchKey::DefaultBackend));

-  // 2.2. Use Math kernel if available. For autograd keys, we only use kernel from Math
+  // 2.2. Use CompositeImplicitAutograd kernel if available. For autograd keys, we only use kernel from CompositeImplicitAutograd
   //      when there's no direct registration to its corresponding backend key or DefaultBackend.
   //      For AutogradOther, we return ambiguousAutogradOtherKernel_ if there's registration
   //      to any of its backends.
   //      See Note [Undefined in dispatchTable_] for the special handling for Undefined.
-  if (dispatch_key == DispatchKey::Undefined || isIncludedInAlias(dispatch_key, DispatchKey::Math)) {
-    if (auto math_registration = getKernelForDispatchKey(DispatchKey::Math)) {
+  if (dispatch_key == DispatchKey::Undefined || isIncludedInAlias(dispatch_key, DispatchKey::CompositeImplicitAutograd)) {
+    if (auto math_registration = getKernelForDispatchKey(DispatchKey::CompositeImplicitAutograd)) {
       if (dispatch_key == DispatchKey::AutogradOther
           && hasKernelForAnyDispatchKey(c10::autogradother_backends)) {
         return {ambiguousAutogradOtherKernel_, "ambiguous autogradother"};
@@ -286,9 +286,9 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
   for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) {
     updateDispatchTableEntry_(dispatcher, k);
   }
-  // Registration to DefaultBackend and Math should be populated to Undefined.
+  // Registration to DefaultBackend and CompositeImplicitAutograd should be populated to Undefined.
   // We cannot do this above since Undefined cannot be represented in DispatchKeySet.
-  if (dispatch_key == DispatchKey::Math || dispatch_key == DispatchKey::DefaultBackend) {
+  if (dispatch_key == DispatchKey::CompositeImplicitAutograd || dispatch_key == DispatchKey::DefaultBackend) {
     updateDispatchTableEntry_(dispatcher, DispatchKey::Undefined);
   }
   // Note [Refresh Runtime Autograd entries in dispatchTable_]
@@ -319,7 +319,7 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher)
   // the error message.
   // In the old world of catchAll, the only way to "register" a kernel to Undefined is by registering it to
   // catchAll. After catchAllKernel_ is removed, Undefined now can get a kernel from either DefaultBackend
-  // or Math alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, Math)
+  // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd)
   // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet.
   for (uint8_t iter = 0; iter != static_cast<uint8_t>(DispatchKey::NumDispatchKeys); ++iter) {
     updateDispatchTable_(dispatcher, static_cast<DispatchKey>(iter));
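For operator authors, the practical effect of the catchAll redirection above is that an impl() with no dispatch key now lands in the CompositeImplicitAutograd entry of kernels_, so one kernel serves the backend keys and the autograd keys alike unless something more specific is registered. A minimal sketch, with an invented namespace and operator name:

#include <ATen/ATen.h>
#include <torch/library.h>

TORCH_LIBRARY(myns_catchall, m) {
  m.def("add_one(Tensor x) -> Tensor");
  // No dispatch key: registerKernel() redirects this old-style catchAll
  // registration to DispatchKey::CompositeImplicitAutograd, per the change
  // in OperatorEntry.cpp above.
  m.impl("add_one", [](const at::Tensor& x) { return x + 1; });
}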

aten/src/ATen/core/op_registration/op_registration_test.cpp

Lines changed: 18 additions & 18 deletions
@@ -520,7 +520,7 @@ TEST(OperatorRegistrationTest, whenRegisteringAutogradKernelWithCatchAllKernel_t
   auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""});
   ASSERT_TRUE(op.has_value());

-  // catchAll now maps to Math which has higher precedence than Autograd
+  // catchAll now maps to CompositeImplicitAutograd which has higher precedence than Autograd
   called_nonautograd = called_autograd = false;
   op->typed<void (Tensor)>().call(dummyTensor(DispatchKey::CPU, /*requires_grad=*/true));
   EXPECT_TRUE(called_nonautograd);
@@ -1306,7 +1306,7 @@ TEST(NewOperatorRegistrationTest, whenRegisteringBackendFallbackKernelAndCatchal

   called = false;
   auto stack = callOp(*op, dummyTensor(c10::DispatchKey::CPU), "hello ");
-  // CatchAll now maps to Math and has higher precedence than backend fallback.
+  // CatchAll now maps to CompositeImplicitAutograd and has higher precedence than backend fallback.
   EXPECT_TRUE(called);
 }

@@ -1325,10 +1325,10 @@ TEST(NewOperatorRegistrationTest, whenRegisteringAutogradKernelWithRegularKernel
   EXPECT_FALSE(called_autograd);
 }

-TEST(NewOperatorRegistrationTest, dispatchWithMathKernel) {
+TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradKernel) {
   bool math_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
-  m.def("fn", torch::dispatch(c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; }));
+  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }));

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
   ASSERT_TRUE(op.has_value());
@@ -1370,17 +1370,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithMathKernel) {
   }
 }

-TEST(NewOperatorRegistrationTest, dispatchWithMathAndAutogradKernel) {
+TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradAndAutogradKernel) {
   bool math_called = false;
   bool autograd_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
-  m.def("fn", torch::dispatch(c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; }));
+  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }));
   m.impl("fn", c10::DispatchKey::Autograd, [&](const Tensor& x) { autograd_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
   ASSERT_TRUE(op.has_value());

-  // Math has higher precedence than Autograd
+  // CompositeImplicitAutograd has higher precedence than Autograd
   {
     math_called = autograd_called = false;
     callOp(*op, dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true));
@@ -1396,17 +1396,17 @@ TEST(NewOperatorRegistrationTest, dispatchWithMathAndAutogradKernel) {
   }
 }

-TEST(NewOperatorRegistrationTest, dispatchWithMathAndCatchAllKernel) {
+TEST(NewOperatorRegistrationTest, dispatchWithCompositeImplicitAutogradAndCatchAllKernel) {
   bool math_called = false;
   bool catchall_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
-  m.def("fn", torch::dispatch(c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; }));
+  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }));
   m.impl("fn", [&](const Tensor& x) { catchall_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
   ASSERT_TRUE(op.has_value());

-  // catchAll now maps to Math, which means we have two registrations to Math key.
+  // catchAll now maps to CompositeImplicitAutograd, which means we have two registrations to CompositeImplicitAutograd key.
   // The last registration is used.
   {
     catchall_called = math_called = false;
@@ -1423,11 +1423,11 @@ TEST(NewOperatorRegistrationTest, dispatchWithMathAndCatchAllKernel) {
   }
 }

-TEST(NewOperatorRegistrationTest, AutogradBackendOverridesMathKernel) {
+TEST(NewOperatorRegistrationTest, AutogradBackendOverridesCompositeImplicitAutogradKernel) {
   bool math_called = false;
   bool autograd_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
-  m.def("fn", torch::dispatch(c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; }));
+  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }));
   m.impl("fn", c10::DispatchKey::AutogradCPU, [&](const Tensor& x) { autograd_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
@@ -1462,11 +1462,11 @@ TEST(NewOperatorRegistrationTest, AutogradBackendOverridesMathKernel) {
   }
 }

-TEST(NewOperatorRegistrationTest, BackendOverridesMathKernel) {
+TEST(NewOperatorRegistrationTest, BackendOverridesCompositeImplicitAutogradKernel) {
   bool math_called = false;
   bool backend_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
-  m.def("fn", torch::dispatch(c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; }));
+  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }));
   m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { backend_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
@@ -1550,12 +1550,12 @@ TEST(NewOperatorRegistrationTest, dispatchWithDefaultBackendKernel) {
   }
 }

-TEST(NewOperatorRegistrationTest, dispatchWithDefaultBackendAndMathKernel) {
+TEST(NewOperatorRegistrationTest, dispatchWithDefaultBackendAndCompositeImplicitAutogradKernel) {
   bool backend_called = false;
   bool math_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
   m.def("fn", torch::dispatch(c10::DispatchKey::DefaultBackend, [&](const Tensor& x) { backend_called = true; return x; }));
-  m.impl("fn", c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; });
+  m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
   ASSERT_TRUE(op.has_value());
@@ -1735,7 +1735,7 @@ TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther
   bool sparsecpu_called, math_called = false;
   auto m = MAKE_TORCH_LIBRARY(test);
   m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; }));
-  m.impl("fn", c10::DispatchKey::Math, [&](const Tensor& x) { math_called = true; return x; });
+  m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; });

   auto op = Dispatcher::singleton().findSchema({"test::fn", ""});
   ASSERT_TRUE(op.has_value());
@@ -1748,7 +1748,7 @@ TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther
   {
     expectThrows<c10::Error>([&] {
       callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true));
-    }, "test::fn has kernels registered to both Math and a backend mapped to AutogradOther.");
+    }, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther.");
   }
 }
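For backends that already have a dedicated Autograd key, the override path that the error message asks AutogradOther backends to request is exactly what the AutogradBackendOverridesCompositeImplicitAutogradKernel test above exercises. A hedged sketch with an invented namespace and operator name:

#include <ATen/ATen.h>
#include <torch/library.h>

TORCH_LIBRARY(myns_override, m) {
  m.def("fn", torch::dispatch(c10::DispatchKey::CompositeImplicitAutograd,
                              [](const at::Tensor& x) { return x; }));
  // A direct AutogradCPU registration outranks the composite when autograd is
  // in play for CPU tensors; no-grad CPU calls still fall through to the
  // CompositeImplicitAutograd kernel, mirroring the test above.
  m.impl("fn", c10::DispatchKey::AutogradCPU,
         [](const at::Tensor& x) { return x; });
}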
