Skip to content

Commit 35f6a69

Browse files
voznesenskym authored and
pytorchmergebot committed
Python Dispatcher integration with C++ dispatcher (pytorch#84826)
Signed-off-by: Edward Z. Yang <ezyang@fb.com> From @ezyang's original PR: There are a number of situations where we have non-backend kernels (e.g., CompositeImplicitAutograd, batching rules) which we would like to port to Python, but we have no way to integrate these ports with the overall system while using preexisting C++ registrations otherwise. This PR changes that by introducing a Python dispatcher (which can have its own kernels directly in Python), which can interpose over ordinary C++ dispatch. The ingredients: We introduce a new PythonDispatcher dispatch key, that has the same tenor as FuncTorchDynamicLayerFrontMode: it works by getting triggered before every other dispatch key in the dispatch key set, and shunting to a Python implementation The Python dispatcher is a per-interpreter global object that is enabled/disabled via the guard EnablePythonDispatcher/DisablePythonDispatcher. We don't make it compositional as I have no idea what a compositional version of this feature would look like. Because it is global, we don't need to memory manage it and so I use a simpler SafePyHandle (newly added) to control access to this pointer from non-Python C++. Like __torch_dispatch__, we use PyInterpreter to get to the Python interpreter to handle the dispatch. I need to reimplement dispatch table computation logic in Python. To do this, I expose a lot more helper functions for doing computations on alias dispatch keys and similar. I also improve the pybind11 handling for DispatchKey so that you can either accept the pybind11 bound enum or a string; this simplifies our binding code. See pybind/pybind11#483 (comment) for how this works; the technique is generally useful. I need to be able to call backend fallbacks. I do this by permitting you to call at a dispatch key which doesn't have a kernel for the operator; if the kernel doesn't exist, we check the backend fallback table instead. Signed-off-by: Edward Z. 
Yang <ezyang@fb.com> Pull Request resolved: pytorch#84826 Approved by: https://github.com/ezyang
1 parent 44c30c5 commit 35f6a69

33 files changed

+686
-160
lines changed

aten/src/ATen/ThreadLocalState.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ ThreadLocalState::ThreadLocalState()
1414
debug_info_(c10::ThreadLocalDebugInfo::current()),
1515
functorch_tls_(functorch::getCopyOfFuncTorchTLS()),
1616
autograd_tls_(c10::AutogradState::get_tls_state()),
17+
python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()),
1718
python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()) {
1819
rf_tls_ = at::get_record_function_tls_();
1920

@@ -41,6 +42,8 @@ void ThreadLocalState::setThreadLocalState(
4142

4243
at::SavedTensorDefaultHooks::set_stack(state.saved_tensors_default_hooks_);
4344

45+
c10::impl::PythonDispatcherTLS::set_state(state.python_dispatcher_state_);
46+
4447
c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_);
4548

4649
c10::impl::_force_tls_local_dispatch_key_set(state.dispatch_key_);

aten/src/ATen/ThreadLocalState.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <ATen/FuncTorchTLS.h>
1111
#include <ATen/PythonTorchFunctionTLS.h>
1212
#include <ATen/record_function.h>
13+
#include <c10/core/impl/PythonDispatcherTLS.h>
1314
#include <c10/core/impl/TorchDispatchModeTLS.h>
1415

1516
namespace at {
@@ -57,6 +58,9 @@ class TORCH_API ThreadLocalState {
5758
// TLS for enable_torch_dispatch_mode
5859
std::shared_ptr<SafePyObject> torch_dispatch_mode_state_;
5960

61+
// TLS for enable_python_dispatcher
62+
SafePyHandle python_dispatcher_state_;
63+
6064
// TLS for __torch_function__ (mode and disable_torch_function)
6165
at::impl::PythonTorchFunctionTLS python_torch_function_state_;
6266

aten/src/ATen/core/PythonFallbackKernel.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include <c10/core/impl/TorchDispatchModeTLS.h>
2+
#include <c10/core/impl/PythonDispatcherTLS.h>
23
#include <ATen/core/PythonFallbackKernel.h>
34
#include <c10/core/SafePyObject.h>
45

@@ -87,6 +88,12 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
8788
TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)");
8889
}
8990

91+
void pythonDispatcherFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
92+
auto state = c10::impl::PythonDispatcherTLS::get_state();
93+
TORCH_INTERNAL_ASSERT(state, "Hit PythonDispatcher dispatch key but PythonDispatcherTLS was not set");
94+
state.pyinterpreter()->python_dispatcher(op, dispatch_keys.remove(c10::DispatchKey::PythonDispatcher), stack);
95+
}
96+
9097
void pythonTLSSnapshotFallback(const c10::OperatorHandle &op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
9198
// It is ok for the tls to be already set here.
9299
// It means that there are multiple calls into the dispatcher not originating from python code.
@@ -134,6 +141,10 @@ TORCH_LIBRARY_IMPL(_, Python, m) {
134141
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>());
135142
}
136143

144+
TORCH_LIBRARY_IMPL(_, PythonDispatcher, m) {
145+
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonDispatcherFallback>());
146+
}
147+
137148
TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) {
138149
m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>());
139150
}

aten/src/ATen/core/dispatch/Dispatcher.h

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,12 @@ class TORCH_API Dispatcher final {
168168
// See Note [Plumbing Keys Through The Dispatcher]
169169
void redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const;
170170

171+
bool hasBackendFallbackForDispatchKey(DispatchKey dk) {
172+
auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk);
173+
if (dispatch_ix < 0) return false;
174+
return backendFallbackKernels_[dispatch_ix].kernel.isValid();
175+
}
176+
171177

172178
// ------------------------------------------------------------------------
173179
//
@@ -333,6 +339,10 @@ class TORCH_API OperatorHandle {
333339
return operatorDef_->op.hasKernelForDispatchKey(k);
334340
}
335341

342+
bool hasKernelForAnyDispatchKey(DispatchKeySet k) const {
343+
return operatorDef_->op.hasKernelForAnyDispatchKey(k);
344+
}
345+
336346
bool hasComputedKernelForDispatchKey(DispatchKey k) const {
337347
return operatorDef_->op.hasComputedKernelForDispatchKey(k);
338348
}
@@ -635,11 +645,18 @@ inline void Dispatcher::callBoxedForDispatchKey(const OperatorHandle& op, Dispat
635645
// We still compute this as we're obligated to pass it on to the internal
636646
// kernel, if it is a boxed fallback
637647
auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
638-
const auto& kernel = entry.kernelForDispatchKey(dk);
648+
const auto& kernel = ([&]() {
649+
if (op.hasKernelForDispatchKey(dk)) {
650+
return entry.kernelForDispatchKey(dk);
651+
} else {
652+
auto idx = getDispatchTableIndexForDispatchKey(dk);
653+
TORCH_INTERNAL_ASSERT(idx >= 0);
654+
return backendFallbackKernels_[idx].kernel;
655+
}
656+
})();
639657
kernel.callBoxed(op, dispatchKeySet, stack);
640658
}
641659

642-
643660
inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const {
644661
// note: this doesn't need the mutex because write operations on the list keep iterators intact.
645662
const auto& entry = op.operatorDef_->op;

c10/core/DispatchKey.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,9 @@ const char* toString(DispatchKey t) {
172172
case DispatchKey::TESTING_ONLY_GenericMode:
173173
return "TESTING_ONLY_GenericMode";
174174

175+
case DispatchKey::PythonDispatcher:
176+
return "PythonDispatcher";
177+
175178
// Aliases
176179

177180
case DispatchKey::Autograd:
@@ -283,6 +286,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
283286
{"TESTING_ONLY_GenericWrapper",
284287
c10::DispatchKey::TESTING_ONLY_GenericWrapper},
285288
{"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode},
289+
{"PythonDispatcher", c10::DispatchKey::PythonDispatcher},
286290

287291
{"CPU", c10::DispatchKey::CPU},
288292
{"CUDA", c10::DispatchKey::CUDA},

c10/core/DispatchKey.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,10 @@ enum class DispatchKey : uint16_t {
401401
// for a usage example
402402
TESTING_ONLY_GenericMode,
403403

404+
// This is a bypass that allows you to skip running the C++ dispatcher
405+
// entirely
406+
PythonDispatcher,
407+
404408
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //
405409
EndOfFunctionalityKeys, // End of functionality keys.
406410

c10/core/DispatchKeySet.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,9 @@ class DispatchKeySet final {
172172
(1ULL
173173
<< (num_backends + static_cast<uint8_t>(toFunctionalityKey(t)) -
174174
1)) -
175-
1) {}
175+
1) {
176+
*this = add(DispatchKey::PythonDispatcher);
177+
}
176178

177179
// Public version of DispatchKeySet(uint64_t) API; external users
178180
// must be explicit when they do this!

c10/core/SafePyObject.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,9 @@ PyObject* SafePyObject::ptr(const c10::impl::PyInterpreter* interpreter) const {
88
return data_;
99
}
1010

11+
PyObject* SafePyHandle::ptr(const c10::impl::PyInterpreter* interpreter) const {
12+
TORCH_INTERNAL_ASSERT(interpreter == pyinterpreter_);
13+
return data_;
14+
}
15+
1116
} // namespace c10

c10/core/SafePyObject.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,29 @@ struct C10_API SafePyObject {
4242
c10::impl::PyInterpreter* pyinterpreter_;
4343
};
4444

45+
// Like SafePyObject, but non-owning. Good for references to global PyObjects
46+
// that will be leaked on interpreter exit. You get a copy constructor/assign
47+
// this way.
48+
struct C10_API SafePyHandle {
49+
SafePyHandle() : data_(nullptr), pyinterpreter_(nullptr) {}
50+
SafePyHandle(PyObject* data, c10::impl::PyInterpreter* pyinterpreter)
51+
: data_(data), pyinterpreter_(pyinterpreter) {}
52+
53+
c10::impl::PyInterpreter& pyinterpreter() const {
54+
return *pyinterpreter_;
55+
}
56+
PyObject* ptr(const c10::impl::PyInterpreter*) const;
57+
void reset() {
58+
data_ = nullptr;
59+
pyinterpreter_ = nullptr;
60+
}
61+
operator bool() {
62+
return data_;
63+
}
64+
65+
private:
66+
PyObject* data_;
67+
c10::impl::PyInterpreter* pyinterpreter_;
68+
};
69+
4570
} // namespace c10

c10/core/impl/PyInterpreter.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
2727
PANIC(dispatch);
2828
}
2929

30+
void python_dispatcher(
31+
const c10::OperatorHandle& op,
32+
c10::DispatchKeySet,
33+
torch::jit::Stack* stack) const override {
34+
PANIC(python_dispatcher);
35+
}
36+
3037
bool is_contiguous(const TensorImpl* self) const override {
3138
PANIC(is_contiguous);
3239
}

0 commit comments

Comments
 (0)