
Commit 2a03bf5

FindHao authored and pytorchmergebot committed
[inductor] fix grid z bug for large grid (pytorch#127448)
Fixes pytorch#123210

https://github.com/pytorch/pytorch/blob/2f3d3ddd70e553d4c5269df699489b82b3aa25ab/torch/_inductor/runtime/triton_heuristics.py#L1733-L1753

If a kernel's y_grid is larger than 65535, it is split into multiple z grids. The grid_fn above performs this split before the kernel launch; however, its computations of yoffset and y_grid are incorrect. For example, for an xy numel of `(1*XBLOCK, 65537*YBLOCK)`, this function returns an [xyz]_grid of (1, 32768, 2). XBLOCK and YBLOCK here are the block sizes used by the following `get_grid_dim`; let's use their default values (4, 1024).

https://github.com/pytorch/pytorch/blob/2f3d3ddd70e553d4c5269df699489b82b3aa25ab/torch/_inductor/runtime/triton_heuristics.py#L1734

[xyz]_grid = (1, 32768, 2) means the workload is divided across two z grids. Because the Triton kernel generation still follows the xy dimensions, one example of a generated kernel is shown below.

```python
@triton.jit
def triton_(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    ynumel = 65537*1024
    xnumel = 1*4
    yoffset = tl.program_id(1) * (tl.program_id(2) + 1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
    ymask = yindex < ynumel
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    x2 = xindex
    y0 = yindex % 128
    y1 = (yindex // 128)
    y3 = yindex
    tmp0 = tl.load(in_ptr0 + (y0 + (128*x2) + (512*y1)), xmask, eviction_policy='evict_last')
    tl.store(out_ptr0 + (x2 + (4*y3)), tmp0, xmask)
```

For a Triton block with xyz index (0, 0, 1), both its yoffset and xoffset are 0, based on the computations `yoffset = tl.program_id(1) * (tl.program_id(2) + 1) * YBLOCK` and `xoffset = tl.program_id(0) * XBLOCK`. So this Triton block accesses the very first elements of the input. The correct yoffset should be `(y_index + z_index * y_grid) * YBLOCK`, which is the starting position of the 2nd z grid.

At the same time, because `y_grid = y_grid // div` is used to compute the maximum number of elements in the y dimension, y_grid becomes 32768. The total number of y grids is then 32768 * 2 = 65536, which is less than the required 65537. So we should use `y_grid = ceildiv(y_grid, div)` to compute the y grid and cover the remaining blocks.

pytorch#123210 is not about AOTInductor; the root cause is the Triton kernel generated by TorchInductor.

Pull Request resolved: pytorch#127448
Approved by: https://github.com/eellison
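To make the collision concrete, here is a minimal plain-Python sketch (not the actual inductor codegen) that evaluates both yoffset formulas for every (y, z) program id of a split grid. The toy sizes YBLOCK = 1024, y_grid = 4, z_grid = 2 are assumptions chosen for readability; the same argument applies to the (32768, 2) split above.

```python
# Assumed toy sizes for readability; the argument is identical for (32768, 2).
YBLOCK, y_grid, z_grid = 1024, 4, 2

def buggy_yoffset(y, z):
    # yoffset = tl.program_id(1) * (tl.program_id(2) + 1) * YBLOCK
    return y * (z + 1) * YBLOCK

def fixed_yoffset(y, z):
    # yoffset = (tl.program_id(1) + tl.program_id(2) * tl.num_programs(1)) * YBLOCK
    return (y + z * y_grid) * YBLOCK

blocks = [(y, z) for z in range(z_grid) for y in range(y_grid)]

# Buggy: block (0, 1) starts at offset 0, re-reading the first input elements.
assert buggy_yoffset(0, 1) == buggy_yoffset(0, 0) == 0
# Fixed: every block gets a distinct, contiguous YBLOCK-sized slice.
assert sorted(fixed_yoffset(y, z) for y, z in blocks) == [
    i * YBLOCK for i in range(y_grid * z_grid)
]
```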
1 parent 4935a01 commit 2a03bf5

5 files changed, +33 -25 lines

test/inductor/test_aot_inductor.py

Lines changed: 8 additions & 19 deletions
```diff
@@ -968,29 +968,19 @@ class Model(torch.nn.Module):
             def __init__(self):
                 super().__init__()
 
-            def forward(self, primals_1, primals_2, primals_5):
-                view = torch.ops.aten.reshape.default(primals_5, [-1, 4, 128])
+            def forward(self, primals_5):
+                view = torch.ops.aten.reshape.default(primals_5, [-1, 2, 4])
                 primals_5 = None
                 permute = torch.ops.aten.permute.default(view, [0, 2, 1])
                 clone = torch.ops.aten.clone.default(
                     permute, memory_format=torch.contiguous_format
                 )
-                permute = None
-                view_1 = torch.ops.aten.reshape.default(clone, [-1, 4])
-                clone = None
-                permute_1 = torch.ops.aten.permute.default(primals_1, [1, 0])
-                primals_1 = None
-                addmm = torch.ops.aten.addmm.default(primals_2, view_1, permute_1)
-                primals_2 = None
-                return addmm
-
-        s0 = 727828
-        s1 = 512
-        example_inputs = (
-            torch.rand(2, 4, device=self.device),
-            torch.rand(2, device=self.device),
-            torch.rand(s0, s1, device=self.device),
-        )
+                return clone
+
+        # let y_grid = 65537
+        s0 = 16777472
+        s1 = 8
+        example_inputs = (torch.rand(s0, s1, device=self.device),)
         self.check_model(Model(), example_inputs)
 
     def test_cond_simple(self):
@@ -3065,7 +3055,6 @@ def fail_non_abi_compatible_cuda(is_skip=False):
 
 CUDA_TEST_FAILURES = {
     # test_failures, xfail by default, set is_skip=True to skip
-    "test_large_grid": fail_cuda(),
     "test_normal_functional": fail_abi_compatible_cuda(is_skip=True),
     # no runtime checks for non_abi_compatible mode
     "test_runtime_checks": fail_non_abi_compatible_cuda(is_skip=True),
```

test/inductor/test_torchinductor.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -10341,6 +10341,23 @@ def test_generate_rand_fp8(self):
         t = rand_strided((2, 3), (3, 1), device=self.device, dtype=torch.float8_e4m3fn)
         self.assertTrue(t.dtype is torch.float8_e4m3fn)
 
+    def test_large_grid(self):
+        # https://github.com/pytorch/pytorch/issues/123210
+        def fn(primals_5):
+            view = torch.ops.aten.reshape.default(primals_5, [-1, 2, 4])
+            primals_5 = None
+            permute = torch.ops.aten.permute.default(view, [0, 2, 1])
+            clone = torch.ops.aten.clone.default(
+                permute, memory_format=torch.contiguous_format
+            )
+            return clone
+
+        s0 = 16777472
+        s1 = 8
+        compiled_fn = torch._dynamo.optimize()(fn)
+        actual = compiled_fn(torch.ones(s0, s1))
+        self.assertTrue((actual == 1).all())
+
 
 @dataclasses.dataclass
 class TestFailure:
```

test/inductor/test_triton_heuristics.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -38,7 +38,7 @@ def test_triton_config(self):
 
     def _test_artificial_zgrid(self):
         def forward(primals_1, primals_2, primals_5):
-            view = torch.ops.aten.reshape.default(primals_5, [-1, 4, 128])
+            view = torch.ops.aten.reshape.default(primals_5, [-1, 2, 4])
             primals_5 = None
             permute = torch.ops.aten.permute.default(view, [0, 2, 1])
             clone = torch.ops.aten.clone.default(
@@ -53,8 +53,8 @@ def forward(primals_1, primals_2, primals_5):
             primals_2 = None
             return addmm
 
-        s0 = 727828
-        s1 = 512
+        s0 = 16777472
+        s1 = 8
 
         args = [
             torch.rand([2, 4], device=GPU_TYPE),
@@ -73,7 +73,6 @@ def forward(primals_1, primals_2, primals_5):
         ]
         self.assertEqual(forward(*args), foo_c(*args))
 
-    @unittest.skip("https://github.com/pytorch/pytorch/issues/123210")
     @expectedFailureXPU
     def test_artificial_zgrid(self):
         self._test_artificial_zgrid()
```

torch/_inductor/codegen/triton.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -2341,7 +2341,10 @@ def iteration_ranges_get_pid(self, entry):
             and not entry.has_zdim
             and not (isinstance(entry.numel, int) and entry.numel <= get_max_y_grid())
         ):
-            key = f"{key} * (tl.program_id({entry.grid_dim + 1}) + 1)"
+            # For ynumel larger than max_ygrid, we need to use zdim.
+            # For each z dimension, there are tl.num_programs(1) y blocks, which is passed by grid(x, y, z).
+            # So we need to add tl.program_id(z) * tl.num_programs(y) * YBLOCK to get the correct yoffset.
+            key = f"({key} + tl.program_id({entry.grid_dim + 1}) * tl.num_programs({entry.grid_dim}))"
         pid = entry.pid_cache.get(key, key)
         if self.index_dtype != "tl.int32":
             return f"{pid}.to({self.index_dtype})"
```
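For reference, a tiny standalone reproduction of the f-strings involved, assuming `entry.grid_dim == 1` (the y dimension) and ignoring the surrounding `pid_cache` and index-dtype handling:

```python
grid_dim = 1                      # y dimension
key = f"tl.program_id({grid_dim})"

# Old (buggy) expression: scales the y program id instead of offsetting it.
old_key = f"{key} * (tl.program_id({grid_dim + 1}) + 1)"
# New expression: offsets the y program id by one full y grid per z program id.
new_key = f"({key} + tl.program_id({grid_dim + 1}) * tl.num_programs({grid_dim}))"

assert old_key == "tl.program_id(1) * (tl.program_id(2) + 1)"
assert new_key == "(tl.program_id(1) + tl.program_id(2) * tl.num_programs(1))"
```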

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1738,7 +1738,7 @@ def grid_fn(meta):
         max_y_grid = get_max_y_grid()
         if znumel is None:
             div = ceildiv(y_grid, max_y_grid)
-            y_grid = y_grid // div
+            y_grid = ceildiv(y_grid, div)
             z_grid = div
         else:
             z_grid = get_grid_dim(znumel, meta.get("ZBLOCK", None))
```
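A quick arithmetic check of this one-line change, using the y_grid = 65537 example from the commit message and the 65535 y-grid limit; `ceildiv` below is a local stand-in for the helper of the same name used in this file:

```python
def ceildiv(a: int, b: int) -> int:
    # Local stand-in for the ceildiv helper used above.
    return -(a // -b)

y_grid, max_y_grid = 65537, 65535
div = ceildiv(y_grid, max_y_grid)   # 2 z grids are needed

old_split = y_grid // div           # 32768; 32768 * 2 = 65536 blocks, one short
new_split = ceildiv(y_grid, div)    # 32769; 32769 * 2 = 65538 blocks, covers all
assert old_split * div < y_grid
assert new_split * div >= y_grid
```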
