
Commit f008efa

voznesenskym authored and pytorchmergebot committed
Reconstruct streams via global registration, temporary impl to unblock FSDP (pytorch#117386)
This is a placeholder implementation that reconstructs streams via global storage to unblock FSDP, pending a proper stream-support design. This PR does a few things:

1) Fixes registration for devices with indices. We previously only supported "cuda"; we now also support "cuda:k" interfaces, where k is the GPU index.
2) Changes the stream objects in dynamo to take devices as device types instead of strings, and updates the string-based device APIs to gracefully accept device types.
3) Introduces reconstruct-by-global (using the existing cleanup-hook structures) for streams as a placeholder implementation for now.

Pull Request resolved: pytorch#117386
Approved by: https://github.com/jansel
1 parent ef3217d commit f008efa

5 files changed: +59 −14 lines

test/dynamo/test_ctx_manager.py

Lines changed: 24 additions & 0 deletions
@@ -177,6 +177,30 @@ def fn(x):
         self.assertEqual(cnts.frame_count, 1)
         self.assertEqual(cnts.op_count, 9)
 
+    @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+    def test_cuda_stream_across_graph_break(self):
+        def fn(x):
+            s = torch.cuda.Stream()
+            x = torch.mul(x, 5)
+            x = torch.add(x, 2)
+
+            print("foo")
+            tcs = torch.cuda.stream(s)
+            with tcs:
+                x = torch.relu(x)
+                x = torch.add(x, 1)
+                x = torch.cos(x)
+            return x
+
+        x = torch.randn((2, 2), device="cuda")
+        ref = fn(x)
+        cnts = torch._dynamo.testing.CompileCounter()
+        opt_fn = torch._dynamo.optimize(cnts)(fn)
+        res = opt_fn(x)
+        self.assertEqual(ref, res)
+        self.assertEqual(cnts.frame_count, 2)
+        self.assertEqual(cnts.op_count, 9)
+
     @unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
     def test_cuda_stream_context_manager2(self):
         def fn(x, s):
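For reference, the frame_count == 2 assertion reflects the graph break introduced by the print() call: Dynamo compiles one frame up to the print and a second frame for the rest of the function. Below is a minimal, CPU-only sketch of the same counting behavior; it is not part of this diff, and the function g is purely illustrative.

import torch
import torch._dynamo
from torch._dynamo.testing import CompileCounter

def g(x):
    x = x + 1
    print("untraceable side effect")  # forces a graph break
    return x * 2

cnts = CompileCounter()
opt_g = torch._dynamo.optimize(cnts)(g)
opt_g(torch.randn(2))
# Two compiled frames: one before the break, one after it.
assert cnts.frame_count == 2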

torch/_dynamo/device_interface.py

Lines changed: 10 additions & 2 deletions
@@ -174,11 +174,17 @@ def get_compute_capability(device: _device_t = None):
 device_interfaces: Dict[str, Type[DeviceInterface]] = {}
 
 
-def register_interface_for_device(device: str, device_interface: Type[DeviceInterface]):
+def register_interface_for_device(
+    device: Union[str, torch.device], device_interface: Type[DeviceInterface]
+):
+    if isinstance(device, torch.device):
+        device = str(device)
     device_interfaces[device] = device_interface
 
 
-def get_interface_for_device(device: str) -> Type[DeviceInterface]:
+def get_interface_for_device(device: Union[str, torch.device]) -> Type[DeviceInterface]:
+    if isinstance(device, torch.device):
+        device = str(device)
     if device in device_interfaces:
         return device_interfaces[device]
     raise NotImplementedError(f"No interface for device {device}")
@@ -189,3 +195,5 @@ def get_registered_device_interfaces() -> Iterable[Tuple[str, Type[DeviceInterfa
 
 
 register_interface_for_device("cuda", CudaInterface)
+for i in range(torch.cuda.device_count()):
+    register_interface_for_device(f"cuda:{i}", CudaInterface)
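A quick sketch of what this registration change enables, assuming a CUDA build with at least one visible GPU: both the index-qualified string and the equivalent torch.device now resolve to the same interface, since torch.device inputs are normalized to their string form.

import torch
from torch._dynamo.device_interface import get_interface_for_device

# "cuda:0" is registered explicitly by the loop above; torch.device("cuda", 0)
# stringifies to "cuda:0", so both lookups hit the same entry.
by_string = get_interface_for_device("cuda:0")
by_device = get_interface_for_device(torch.device("cuda", 0))
assert by_string is by_device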

torch/_dynamo/symbolic_convert.py

Lines changed: 0 additions & 1 deletion
@@ -815,7 +815,6 @@ def popn(self, n: int) -> List[VariableTracker]:
 
     def LOAD_FAST(self, inst):
         name = inst.argval
-
         if name in self.f_locals and config.replay_record_enabled:
             self.exec_recorder.add_local_var(name, self.f_locals[name])
 

torch/_dynamo/variables/builder.py

Lines changed: 8 additions & 4 deletions
@@ -84,6 +84,7 @@
     AutocastModeVariable,
     EventVariable,
     NullContextVariable,
+    StreamContextVariable,
     StreamVariable,
 )
 from .dicts import (
@@ -570,12 +571,17 @@ def build_key_value(k, v):
         elif isinstance(value, HigherOrderOperator):
             self.install_guards(GuardBuilder.TYPE_MATCH, GuardBuilder.NAME_MATCH)
             return TorchHigherOrderOperatorVariable.make(value, source=self.source)
+        elif isinstance(value, torch.cuda.StreamContext):
+            self.install_guards(GuardBuilder.ID_MATCH)
+            stream_source = AttrSource(self.source, "stream")
+            stream_var = VariableBuilder(self.tx, stream_source)(value.stream)
+            return StreamContextVariable.create(self.tx, stream_var)
         elif isinstance(value, _StreamBase):
            self.install_guards(GuardBuilder.ID_MATCH)
            return StreamVariable(
                None,
                value,
-               value.device.type,
+               value.device,
                source=self.source,
            )
         elif isinstance(value, _EventBase):
@@ -1500,9 +1506,7 @@ def _clone_input(value):
             for _, device_interface in get_registered_device_interfaces()
         ]:
             proxy.node.meta["example_value"] = example_value
-            return StreamVariable(
-                proxy, example_value, example_value.device.type, **options
-            )
+            return StreamVariable(proxy, example_value, example_value.device, **options)
         elif (
             inspect.isclass(proxy.node.target) and issubclass(proxy.node.target, _EventBase)
         ) or proxy.node.target in [
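The new torch.cuda.StreamContext branch covers the case where an already-constructed stream context enters a compiled frame from the outside (for example as a global or closure), rather than being created inside the traced code. A hedged sketch of that usage follows, assuming CUDA is available; the function fn here is illustrative and not part of the diff.

import torch

s = torch.cuda.Stream()
ctx = torch.cuda.stream(s)  # a torch.cuda.StreamContext built outside the compiled region

@torch.compile(backend="eager")
def fn(x):
    # ctx reaches the VariableBuilder as a StreamContext value; it is guarded by
    # identity and rebuilt as a StreamContextVariable wrapping its .stream attribute.
    with ctx:
        return torch.relu(x) + 1

out = fn(torch.randn(2, 2, device="cuda"))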

torch/_dynamo/variables/ctx_manager.py

Lines changed: 17 additions & 7 deletions
@@ -521,19 +521,13 @@ def exit(self, tx, *args):
             )
         self.state.cleanup_assert()
 
-    def module_name(self):
-        return "torch." + str(self.device)
-
-    def fn_name(self):
-        return "stream"
-
 
 class StreamVariable(VariableTracker):
     def __init__(self, proxy, value, device, **kwargs):
         if proxy is not None and "example_value" in proxy.node.meta:
             assert proxy.node.meta["example_value"] == value
         assert (
-            value.device.type == device
+            value.device.type == device.type
         ), "stream value is not equal to the passed device"
         super().__init__(**kwargs)
         self.proxy = proxy
@@ -586,6 +580,22 @@ def call_method(
     def as_proxy(self):
         return self.proxy
 
+    def reconstruct(self, codegen):
+        # If we got here, this stream is fully subsumed by the graph - this means it is
+        # not an input or global
+        assert not self.source
+        # Since we just proved that - for other such structures, like lists and dicts, reconstruction
+        # is fine and sound according to dynamo principles of treating collectives. However,
+        # streams are special in that we want to preserve the identity of the stream as the same as in the graph
+        # Normally, we would do this via codegen for the proxy mapping to an output - we cannot do this yet, as we do not
+        # yet have a plan for how we want to handle the case where the stream is used as an input or an output. Pending
+        # design, to unblock current work, we lift the stream into a global and then codegen bytecode to load it from there.
+        name = f"_stream_{self.device}_{id(self.value)}"
+        if name not in codegen.tx.output.global_scope:
+            codegen.tx.output.install_global(name, self.value)
+
+        return [codegen.create_load_global(name, push_null=False, add=True)]
+
 
 class EventVariable(VariableTracker):
     def __init__(self, proxy, value, **kwargs):
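The reconstruct-by-global approach matters because a stream rebuilt by value (the way lists or dicts are reconstructed) would be a different object from the one captured in the graph, breaking identity-sensitive code such as current-stream checks. Here is a minimal, dynamo-free sketch of the idea; the names GLOBAL_SCOPE, lift_to_global, and FakeStream are illustrative, not PyTorch APIs.

GLOBAL_SCOPE = {}

def lift_to_global(obj, device):
    # Mirror the naming scheme used in StreamVariable.reconstruct: one slot per object.
    name = f"_stream_{device}_{id(obj)}"
    # Installing the object once means every later load by name yields the very same
    # instance, so identity (not just equality) survives reconstruction.
    GLOBAL_SCOPE.setdefault(name, obj)
    return name

class FakeStream:
    pass

s = FakeStream()
name = lift_to_global(s, "cuda:0")
assert GLOBAL_SCOPE[name] is s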
