
Commit 254128c

masnesral authored and pytorchmergebot committed
[inductor] Remove usage of device_interface from _inductor.runtime (pytorch#124592)
Differential Revision: [D56723770](https://our.internmc.facebook.com/intern/diff/D56723770)
Co-authored-by: Sam Larsen <slarsen@meta.com>
Pull Request resolved: pytorch#124592
Approved by: https://github.com/masnesral
1 parent 5f4c6d9 commit 254128c
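
In short: Inductor's runtime previously consulted a device interface (which imports torch) to interpret a bare device index stored in triton_meta. After this change, codegen captures what the runtime needs in a pickleable DeviceProperties record under the single "device" key, so _inductor.runtime no longer depends on torch._dynamo.device_interface at import time. A minimal before/after sketch of the metadata shape; the names triton_meta_old / triton_meta_new are illustrative, not from the codebase:

import torch
from torch._inductor.runtime.hints import DeviceProperties

# Before: a bare index plus a separate "device_type" key; the runtime had to
# ask the device interface for anything else about the device.
triton_meta_old = {
    "signature": {0: "*fp32", 1: "*fp32", 2: "i32"},
    "device": 0,
    "device_type": "cuda",
}

# After: one self-describing record created on the main process.
triton_meta_new = {
    "signature": {0: "*fp32", 1: "*fp32", 2: "i32"},
    "device": DeviceProperties.create(torch.device("cuda")),
}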

File tree

8 files changed: +99 −78 lines


test/inductor/test_cuda_repro.py

Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@
 from torch._dynamo.utils import same
 from torch._inductor import config
 from torch._inductor.compile_fx import compile_fx_inner
+from torch._inductor.runtime.hints import DeviceProperties
 from torch._inductor.utils import run_and_get_code
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing import FileCheck
@@ -405,7 +406,7 @@ def decorator(fn):
             ],
             meta={
                 "signature": {0: "*fp32", 1: "*fp32", 2: "i32"},
-                "device": 0,
+                "device": DeviceProperties.create(torch.device("cuda")),
                 "configs": [instance_descriptor(divisible_by_16=(0, 1), equal_to_1=())],
                 "constants": {},
             },

torch/_inductor/codecache.py

Lines changed: 8 additions & 22 deletions
@@ -45,16 +45,12 @@
     Optional,
     Set,
     Tuple,
-    Type,
     TYPE_CHECKING,
     Union,
 )

 import torch
-from torch._dynamo.device_interface import (
-    get_interface_for_device,
-    get_registered_device_interfaces,
-)
+from torch._dynamo.device_interface import get_registered_device_interfaces
 from torch._dynamo.utils import counters, dynamo_timed
 from torch._inductor import config, exc, metrics
 from torch._inductor.codegen.cuda import cuda_env
@@ -70,7 +66,6 @@
 from torch.fx.experimental.symbolic_shapes import has_hint, hint_int, ShapeEnv

 if TYPE_CHECKING:
-    from torch._dynamo.device_interface import DeviceInterface
     from torch._inductor.graph import GraphLowering
     from torch._inductor.ir import ChoiceCaller

@@ -2823,14 +2818,9 @@ def _set_triton_ptxas_path() -> None:

 def _worker_compile_triton(
     load_kernel: Callable[[], Any],
-    cc: int,
-    device: torch.device,
-    device_interface: Type[DeviceInterface],
 ):
     _set_triton_ptxas_path()
-    device_interface.Worker.set_device(device.index)
-    kernel = load_kernel()
-    kernel.precompile(warm_cache_only_with_cc=cc)
+    load_kernel().precompile(warm_cache_only=True)


 class CodeCacheFuture:
@@ -2993,17 +2983,13 @@ def triton(self, kernel_name: str, source_code: str, device_str: str = "cuda"):

         kernel = TritonCodeCache.load(kernel_name, source_code)
         if config.compile_threads > 1:
-            device_interface = get_interface_for_device(device_str)
-            device = torch.device(device_str, device_interface.current_device())
-            cc = device_interface.get_compute_capability(device)
-            future = self.process_pool().submit(
-                _worker_compile_triton,
-                kernel._reload_in_subproc,
-                cc,
-                device,
-                device_interface,
+            return TritonFuture(
+                kernel,
+                self.process_pool().submit(
+                    _worker_compile_triton,
+                    kernel._reload_in_subproc,
+                ),
             )
-            return TritonFuture(kernel, future)
         else:
             kernel.precompile()
             return kernel
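
The worker no longer needs a device, compute capability, or device interface because warm_cache_only=True compiles ahead of time without loading the binary onto a device; device selection happens later on the main process. A hedged sketch of the resulting submit path, with kernel and pool as placeholders for the loaded TritonCodeCache kernel and the compile process pool (warm_compile_async itself is illustrative, not a function in the codebase):

def warm_compile_async(kernel, pool):
    # Only a picklable loader callable crosses the process boundary; the
    # subprocess never touches CUDA/HIP state.
    future = pool.submit(
        _worker_compile_triton,      # new signature: just the loader
        kernel._reload_in_subproc,
    )
    return TritonFuture(kernel, future)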

torch/_inductor/codegen/triton.py

Lines changed: 3 additions & 4 deletions
@@ -34,7 +34,7 @@
 from torch._dynamo.utils import preserve_rng_state

 from torch._inductor.metrics import is_metric_table_enabled, log_kernel_metadata
-from torch._inductor.runtime.hints import AutotuneHint
+from torch._inductor.runtime.hints import AutotuneHint, DeviceProperties
 from torch._prims_common import is_integer_dtype
 from torch.utils._sympy.functions import FloorDiv, ModularIndexing
 from torch.utils._sympy.value_ranges import ValueRanges
@@ -125,7 +125,7 @@ def gen_common_triton_imports():
         """
         from torch._inductor.runtime import triton_helpers, triton_heuristics
         from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor
+        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties
         """
     )
     return imports.getvalue()
@@ -2833,8 +2833,7 @@ def codegen_kernel(self, name=None):
         )
         triton_meta = {
             "signature": triton_meta_signature,
-            "device": V.graph.scheduler.current_device.index,
-            "device_type": V.graph.scheduler.current_device.type,
+            "device": DeviceProperties.create(V.graph.scheduler.current_device),
             "constants": {},
         }

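Since gen_common_triton_imports now adds DeviceProperties to the generated kernel's import block, the triton_meta that codegen_kernel embeds can spell out the device record directly in the emitted source. A rough, abridged sketch of what a generated kernel header might contain after this change; the field values in the comment are made up for illustration:

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import (
    AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties
)

# The embedded metadata can now round-trip without importing torch, e.g.:
# triton_meta = {
#     "signature": {...},
#     "device": DeviceProperties(type="cuda", index=0, cc=80, major=8,
#                                regs_per_multiprocessor=65536,
#                                max_threads_per_multi_processor=2048,
#                                multi_processor_count=108),
#     "constants": {},
# }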

torch/_inductor/codegen/triton_foreach.py

Lines changed: 2 additions & 2 deletions
@@ -6,6 +6,7 @@
 from sympy import Integer

 from .. import metrics
+from ..runtime.hints import DeviceProperties
 from ..scheduler import SchedulerNode
 from ..utils import ceildiv, Placeholder
 from ..virtualized import V
@@ -157,8 +158,7 @@ def jit_lines(self):
         _, _, signature = self.args.python_argdefs()
         triton_meta = {
             "signature": signature_to_meta(signature, size_dtype=size_dtype),
-            "device": V.graph.scheduler.current_device.index,
-            "device_type": V.graph.scheduler.current_device.type,
+            "device": DeviceProperties.create(V.graph.scheduler.current_device),
             "constants": {},
         }
         triton_meta["configs"] = [config_of(signature)]

torch/_inductor/codegen/wrapper.py

Lines changed: 2 additions & 2 deletions
@@ -40,6 +40,7 @@
 from .. import codecache, config, ir
 from ..ir import ReinterpretView
 from ..runtime import triton_heuristics
+from ..runtime.hints import DeviceProperties
 from ..utils import (
     cache_on_self,
     get_benchmark_name,
@@ -1130,8 +1131,7 @@ def define_user_defined_triton_kernel(self, kernel, configs, kwargs):
                 size_dtype=index_dtype,
                 indices=non_constant_indices,
             ),
-            "device": V.graph.scheduler.current_device.index,
-            "device_type": V.graph.scheduler.current_device.type,
+            "device": DeviceProperties.create(V.graph.scheduler.current_device),
             # Triton compiler includes equal_to_1 args into constants even
             # when they are not constexpr. otherwise there may be a segfault
             # during launching the Inductor-compiled Triton kernel.

torch/_inductor/runtime/hints.py

Lines changed: 38 additions & 0 deletions
@@ -1,6 +1,8 @@
 import collections
+import typing
 from dataclasses import fields
 from enum import auto, Enum
+from typing import Optional


 # NOTE: if these fail asserts submit a PR to increase them
@@ -89,3 +91,39 @@ class AutotuneHint(Enum):
     # which isn't valid python.
     # Enum.__str__ will just return "AutotuneHint.ELEMENTS_PER_WARP_32".
     __repr__ = Enum.__str__
+
+
+class DeviceProperties(typing.NamedTuple):
+    """Copy device properties into a data structure not requiring torch to be imported"""
+
+    type: str  # type: ignore[assignment]
+    index: int  # type: ignore[assignment]
+    cc: int
+    major: Optional[int] = None
+    regs_per_multiprocessor: Optional[int] = None
+    max_threads_per_multi_processor: Optional[int] = None
+    multi_processor_count: Optional[int] = None
+
+    @classmethod
+    def create(cls, device):
+        import torch
+        from torch._dynamo.device_interface import get_interface_for_device
+
+        device_type = device.type if torch.version.hip is None else "hip"
+        device_interface = get_interface_for_device(device)
+        if device_type == "cuda":
+            props = device_interface.get_device_properties(device)
+            return cls(
+                type=device_type,
+                index=device.index,
+                cc=device_interface.get_compute_capability(device),
+                major=props.major,
+                regs_per_multiprocessor=props.regs_per_multiprocessor,
+                max_threads_per_multi_processor=props.max_threads_per_multi_processor,
+                multi_processor_count=props.multi_processor_count,
+            )
+        return cls(
+            type=device_type,
+            index=device.index,
+            cc=device_interface.get_compute_capability(device),
+        )
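
DeviceProperties is a plain NamedTuple, so once it is created on the main process (where torch and the device interface are importable) it can be pickled into compile workers and serialized into generated code. A small usage sketch, assuming a CUDA build with at least one visible GPU; the commented values are examples, not guarantees:

import torch
from torch._inductor.runtime.hints import DeviceProperties

# Create where torch is available; consumers then only see plain Python data.
props = DeviceProperties.create(torch.device("cuda", 0))

print(props.type)   # "cuda" ("hip" on ROCm builds)
print(props.index)  # 0
print(props.cc)     # compute capability as reported by the device interface
print(props.major)  # populated on the CUDA path, None otherwise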

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 42 additions & 45 deletions
@@ -16,12 +16,12 @@

 import torch

-from torch._dynamo.device_interface import DeviceGuard, get_interface_for_device
 from .coordinate_descent_tuner import CoordescTuner

 from .hints import (
     _NUM_THREADS_PER_WARP,
     AutotuneHint,
+    DeviceProperties,
     HeuristicType,
     ReductionHint,
     TileHint,
@@ -144,21 +144,19 @@ def __init__(

         assert len(configs) > 0, "Non-empty TritonConfig list required for compiling"
         self.fn = fn
-        self.triton_meta = triton_meta
+        self.device_props: DeviceProperties = triton_meta["device"]
+        self.triton_meta = {
+            **triton_meta,
+            "device": self.device_props.index,
+            "device_type": self.device_props.type,
+        }
         self.inductor_meta = {} if inductor_meta is None else inductor_meta
         self.save_cache_hook = save_cache_hook
         self.mutated_arg_names = mutated_arg_names
         self.configs = configs
         self.heuristic_type = heuristic_type
         self.custom_kernel = custom_kernel
         self.cuda_kernel_saved = False
-
-        # Align the default design that default as cuda
-        self.device_type = (
-            triton_meta["device_type"] if "device_type" in triton_meta else "cuda"
-        )
-        self.device_interface = get_interface_for_device(self.device_type)
-
         if log.isEnabledFor(logging.DEBUG):
             log.debug(
                 "CachingAutotuner gets %d configs for %s",
@@ -186,7 +184,7 @@ def __init__(
         )
         self.filename = filename

-    def precompile(self, warm_cache_only_with_cc=None):
+    def precompile(self, warm_cache_only=False):
         with self.lock:
             if self.launchers:
                 return
@@ -197,7 +195,7 @@ def precompile(self, warm_cache_only_with_cc=None):
             for c in self.configs:
                 try:
                     compiled_binary, launcher = self._precompile_config(
-                        c, warm_cache_only_with_cc
+                        c, warm_cache_only
                     )
                 except OutOfResources as e:
                     if len(self.configs) == 1:
@@ -215,19 +213,19 @@

             seen_configs = set(self.configs)

-            device_prop = self.device_interface.Worker.get_device_properties(
-                self.triton_meta["device"]
-            )
+            device_prop = self.device_props
             if (
                 self.inductor_meta.get("dynamic_scale_rblock", True)
                 and self.heuristic_type == HeuristicType.REDUCTION
                 and self.size_hints is not None
-                # Disable for AMDGPU as Triton is not ready to return n_regs for a compiled_binary.
-                and not self.inductor_meta.get("is_hip")
-                # Disable for Intel GPU as Triton is not ready to return n_regs for a compiled_binary.
-                and self.device_type != "xpu"
+                # Disable for AMDGPU/Intel as Triton is not ready to return n_regs for a compiled_binary.
+                and device_prop.type == "cuda"
+                and device_prop.major
                 and device_prop.major >= 8
             ):
+                assert device_prop.regs_per_multiprocessor
+                assert device_prop.max_threads_per_multi_processor
+                assert device_prop.multi_processor_count
                 for triton_config, compiled_binary in zip(
                     self.configs, compiled_binaries
                 ):
@@ -288,15 +286,21 @@ def precompile(self, warm_cache_only_with_cc=None):
                     continue
                 seen_configs.add(new_config)
                 self.launchers.append(
-                    self._precompile_config(new_config, warm_cache_only_with_cc)[1]
+                    self._precompile_config(new_config, warm_cache_only)[1]
                 )
             self.configs = None

-    def _precompile_config(self, cfg: Config, warm_cache_only_with_cc: Optional[int]):
+    def get_device_interface(self):
+        # this code cannot run in compile workers, because it imports from torch
+        from torch._dynamo.device_interface import get_interface_for_device
+
+        return get_interface_for_device(self.device_props.type.replace("hip", "cuda"))
+
+    def _precompile_config(self, cfg: Config, warm_cache_only: bool):
         """Ahead of time compile a given autotuner config."""
         compile_meta = copy.deepcopy(self.triton_meta)
         for k, v in cfg.kwargs.items():
-            if torch.version.hip is not None:
+            if self.device_props.type != "hip":
                 if k == "matrix_instr_nonkdim":
                     compile_meta["matrix_instr_nonkdim"] = v
                     continue
@@ -310,22 +314,9 @@ def _precompile_config(self, cfg: Config, warm_cache_only_with_cc: Optional[int]
             "assert_indirect_indexing", True
         ) and not self.inductor_meta.get("is_hip", False)

-        # Setting device_type="hip" required on ROCm to pass down to triton
-        compile_meta["device_type"] = (
-            self.device_type if torch.version.hip is None else "hip"
-        )
-
-        if warm_cache_only_with_cc:
-            cc = warm_cache_only_with_cc
-        else:
-            # Use device_type 'cuda' for both cuda and hip devices to retrieve
-            # the compute capability.
-            device_type = self.device_type if torch.version.hip is None else "cuda"
-            device_id = compile_meta["device"]
-            device = torch.device(device_type, device_id)
-            cc = self.device_interface.get_compute_capability(device)
-
-        compile_meta["cc"] = cc
+        # device type will be "hip" rather than "cuda" here
+        compile_meta["device_type"] = self.device_props.type
+        compile_meta["cc"] = self.device_props.cc

         if ASTSource:
             compile_args = (
@@ -354,7 +345,7 @@ def _precompile_config(self, cfg: Config, warm_cache_only_with_cc: Optional[int]
             "num_stages": compile_meta["num_stages"],
             "debug": compile_meta["debug"],
         }
-        if torch.version.hip is not None:
+        if self.device_props.type != "hip":
             if "waves_per_eu" in compile_meta:
                 options["waves_per_eu"] = compile_meta["waves_per_eu"]
             if "matrix_instr_nonkdim" in compile_meta:
@@ -369,16 +360,21 @@ def _precompile_config(self, cfg: Config, warm_cache_only_with_cc: Optional[int]
             compile_args = (self.fn,)
             compile_kwargs = compile_meta

-        if warm_cache_only_with_cc:
+        if warm_cache_only:
             return (
                 triton.compile(*compile_args, **compile_kwargs),
                 None,
             )

+        # importing from torch is safe now that precompile has returned
+        from torch._dynamo.device_interface import DeviceGuard
+
+        device_interface = self.get_device_interface()
+
         # load binary to the correct device
-        with DeviceGuard(self.device_interface, compile_meta["device"]):  # type: ignore[attr-defined]
+        with DeviceGuard(device_interface, compile_meta["device"]):  # type: ignore[attr-defined]
             # need to initialize context
-            self.device_interface.synchronize(self.device_interface.current_device())
+            device_interface.synchronize(device_interface.current_device())

             try:
                 binary = triton.compile(*compile_args, **compile_kwargs)
@@ -596,8 +592,9 @@ def bench(self, launcher, *args, grid, **kwargs):
             )
             return float("inf")

-        stream = self.device_interface.get_raw_stream(  # type: ignore[call-arg]
-            self.device_interface.current_device()
+        device_interface = self.get_device_interface()
+        stream = device_interface.get_raw_stream(  # type: ignore[call-arg]
+            device_interface.current_device()
         )

         def kernel_call():
@@ -706,7 +703,7 @@ def save_cuda_kernel(self, grid, stream, launcher):

         binary = (
             launcher.bin.asm["cubin"]
-            if torch.version.hip is None
+            if self.device_props.type != "hip"
             else launcher.bin.asm["hsaco"]
         )
         CudaKernelParamCache.set(key, params, binary)
@@ -736,7 +733,7 @@ def coordinate_descent_tuning(self, launcher, *args, **kwargs):

         def benchmark_one_config(config):
             with self.lock:
-                _, launcher = self._precompile_config(config, None)
+                _, launcher = self._precompile_config(config, False)
             config2launcher[config] = launcher

             out = self.bench(launcher, *cloned_args, **kwargs)
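
To keep Triton's compile API unchanged, CachingAutotuner splits the incoming metadata back apart: it keeps the rich DeviceProperties record for its own decisions, hands Triton the flat device / device_type keys, and imports the device interface only lazily, never inside a compile worker. A condensed sketch of that bookkeeping; split_triton_meta and lazy_device_interface are hypothetical helpers mirroring the __init__ and get_device_interface code above:

from torch._inductor.runtime.hints import DeviceProperties

def split_triton_meta(triton_meta):
    # Hypothetical mirror of CachingAutotuner.__init__: keep the record for
    # ourselves, flatten it for Triton's compile metadata.
    device_props: DeviceProperties = triton_meta["device"]
    flat_meta = {
        **triton_meta,
        "device": device_props.index,
        "device_type": device_props.type,
    }
    return device_props, flat_meta

def lazy_device_interface(device_props: DeviceProperties):
    # Hypothetical mirror of get_device_interface(): importing torch here is
    # fine because this never runs in a compile worker; "hip" maps onto the
    # "cuda" interface, matching the code above.
    from torch._dynamo.device_interface import get_interface_for_device

    return get_interface_for_device(device_props.type.replace("hip", "cuda"))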
