@@ -291,12 +291,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
291
291
let parser = [{ return parseGPUFuncOp(parser, result); }];
292
292
}
293
293
294
- def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
295
- Arguments<(ins SymbolRefAttr:$kernel,
294
+ def GPU_LaunchFuncOp : GPU_Op<"launch_func",
295
+ [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
296
+ Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
297
+ SymbolRefAttr:$kernel,
296
298
Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
297
299
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
298
300
Variadic<AnyType>:$operands)>,
299
- Results<(outs)> {
301
+ Results<(outs Optional<GPU_AsyncToken>:$asyncToken )> {
300
302
let summary = "Launches a function as a GPU kernel";
301
303
302
304
let description = [{
@@ -308,14 +310,22 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
308
310
function is required to be a gpu.module. And finally, the module containing
309
311
the kernel module (which thus cannot be the top-level module) is required
310
312
to have the `gpu.container_module` attribute. The `gpu.launch_func`
311
- operation has a symbol attribute named `kernel` to identify the fully
313
+ operation has a symbol attribute named `kernel` to identify the fully
312
314
specified kernel function to launch (both the gpu.module and func).
313
315
314
- The operation takes at least six operands, with the first three operands
315
- being grid sizes along x,y,z dimensions and the following three being block
316
- sizes along x,y,z dimensions. When a lower-dimensional kernel is required,
317
- unused sizes must be explicitly set to `1`. The remaining operands are
318
- passed as arguments to the kernel function.
316
+ The `gpu.launch_func` supports async dependencies: the kernel does not start
317
+ executing until the ops producing those async dependencies have completed.
318
+
319
+ By default, the host implicitly blocks until kernel execution has
320
+ completed. If the `async` keyword is present, the host does not block but
321
+ instead a `!gpu.async.token` is returned. Other async GPU ops can take this
322
+ token as a dependency.
323
+
324
+ The operation requires at least the grid and block sizes along the x,y,z
325
+ dimensions as arguments. When a lower-dimensional kernel is required,
326
+ unused sizes must be explicitly set to `1`.
327
+
328
+ The remaining operands are passed as arguments to the kernel function.
319
329
320
330
Example:
321
331
@@ -351,11 +361,15 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
351
361
}
352
362
}
353
363
364
+ %t0 = gpu.wait async
354
365
gpu.launch_func
355
- @kernels::@kernel_1 // Kernel function.
356
- blocks in (%cst, %cst, %cst) // Grid size.
357
- threads in (%cst, %cst, %cst) // Block size.
358
- args(%arg0 : f32, %arg1 : memref<?xf32, 1>) // Kernel arguments.
366
+ async // (Optional) Don't block host, return token.
367
+ [%t0] // (Optional) Execute only after %t0 has completed.
368
+ @kernels::@kernel_1 // Kernel function.
369
+ blocks in (%cst, %cst, %cst) // Grid size.
370
+ threads in (%cst, %cst, %cst) // Block size.
371
+ args(%arg0 : f32, // (Optional) Kernel arguments.
372
+ %arg1 : memref<?xf32, 1>)
359
373
}
360
374
```
361
375
}];
@@ -401,6 +415,7 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
401
415
402
416
let verifier = [{ return ::verify(*this); }];
403
417
let assemblyFormat = [{
418
+ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
404
419
$kernel
405
420
`blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
406
421
`threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
0 commit comments