@@ -26,6 +26,11 @@ enum OpenMPRTLFunctionNVPTX {
26
26
OMPRTL_NVPTX__kmpc_kernel_init,
27
27
// / \brief Call to void __kmpc_kernel_deinit();
28
28
OMPRTL_NVPTX__kmpc_kernel_deinit,
29
+ // / \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
30
+ // / short RequiresOMPRuntime, short RequiresDataSharing);
31
+ OMPRTL_NVPTX__kmpc_spmd_kernel_init,
32
+ // / \brief Call to void __kmpc_spmd_kernel_deinit();
33
+ OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
29
34
// / \brief Call to void __kmpc_kernel_prepare_parallel(void
30
35
// / *outlined_function);
31
36
OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
@@ -76,6 +81,25 @@ class NVPTXActionTy final : public PrePostActionTy {
76
81
CGF.EmitRuntimeCall (ExitCallee, ExitArgs);
77
82
}
78
83
};
84
+
85
+ // A class to track the execution mode when codegening directives within
86
+ // a target region. The appropriate mode (generic/spmd) is set on entry
87
+ // to the target region and used by containing directives such as 'parallel'
88
+ // to emit optimized code.
89
+ class ExecutionModeRAII {
90
+ private:
91
+ CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
92
+ CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
93
+
94
+ public:
95
+ ExecutionModeRAII (CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
96
+ CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
97
+ : Mode(Mode) {
98
+ SavedMode = Mode;
99
+ Mode = NewMode;
100
+ }
101
+ ~ExecutionModeRAII () { Mode = SavedMode; }
102
+ };
79
103
} // anonymous namespace
80
104
81
105
// / Get the GPU warp size.
@@ -116,12 +140,17 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
116
140
static void syncCTAThreads(CodeGenFunction &CGF) {
  // Thin wrapper: all of the work is done by the CTA barrier helper.
  getNVPTXCTABarrier(CGF);
}
117
141
118
142
// / Get the value of the thread_limit clause in the teams directive.
119
- // / The runtime encodes thread_limit in the launch parameter, always starting
120
- // / thread_limit+warpSize threads per team.
121
- static llvm::Value *getThreadLimit (CodeGenFunction &CGF) {
143
+ // / For the 'generic' execution mode, the runtime encodes thread_limit in
144
+ // / the launch parameters, always starting thread_limit+warpSize threads per
145
+ // / CTA. The threads in the last warp are reserved for master execution.
146
+ // / For the 'spmd' execution mode, all threads in a CTA are part of the team.
147
+ static llvm::Value *getThreadLimit (CodeGenFunction &CGF,
148
+ bool IsInSpmdExecutionMode = false ) {
122
149
CGBuilderTy &Bld = CGF.Builder ;
123
- return Bld.CreateSub (getNVPTXNumThreads (CGF), getNVPTXWarpSize (CGF),
124
- " thread_limit" );
150
+ return IsInSpmdExecutionMode
151
+ ? getNVPTXNumThreads (CGF)
152
+ : Bld.CreateSub (getNVPTXNumThreads (CGF), getNVPTXWarpSize (CGF),
153
+ " thread_limit" );
125
154
}
126
155
127
156
// / Get the thread id of the OMP master thread.
@@ -159,12 +188,33 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
159
188
CGM.SetInternalFunctionAttributes (/* D=*/ nullptr , WorkerFn, *CGFI);
160
189
}
161
190
191
+ bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode () const {
192
+ return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
193
+ }
194
+
195
+ static CGOpenMPRuntimeNVPTX::ExecutionMode
196
+ getExecutionModeForDirective (CodeGenModule &CGM,
197
+ const OMPExecutableDirective &D) {
198
+ OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind ();
199
+ switch (DirectiveKind) {
200
+ case OMPD_target:
201
+ return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
202
+ case OMPD_target_parallel:
203
+ return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
204
+ default :
205
+ llvm_unreachable (" Unsupported directive on NVPTX device." );
206
+ }
207
+ llvm_unreachable (" Unsupported directive on NVPTX device." );
208
+ }
209
+
162
210
void CGOpenMPRuntimeNVPTX::emitGenericKernel (const OMPExecutableDirective &D,
163
211
StringRef ParentName,
164
212
llvm::Function *&OutlinedFn,
165
213
llvm::Constant *&OutlinedFnID,
166
214
bool IsOffloadEntry,
167
215
const RegionCodeGenTy &CodeGen) {
216
+ ExecutionModeRAII ModeRAII (CurrentExecutionMode,
217
+ CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
168
218
EntryFunctionState EST;
169
219
WorkerFunctionState WST (CGM);
170
220
Work.clear ();
@@ -252,6 +302,94 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
252
302
EST.ExitBB = nullptr ;
253
303
}
254
304
305
+ void CGOpenMPRuntimeNVPTX::emitSpmdKernel (const OMPExecutableDirective &D,
306
+ StringRef ParentName,
307
+ llvm::Function *&OutlinedFn,
308
+ llvm::Constant *&OutlinedFnID,
309
+ bool IsOffloadEntry,
310
+ const RegionCodeGenTy &CodeGen) {
311
+ ExecutionModeRAII ModeRAII (CurrentExecutionMode,
312
+ CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
313
+ EntryFunctionState EST;
314
+
315
+ // Emit target region as a standalone region.
316
+ class NVPTXPrePostActionTy : public PrePostActionTy {
317
+ CGOpenMPRuntimeNVPTX &RT;
318
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
319
+ const OMPExecutableDirective &D;
320
+
321
+ public:
322
+ NVPTXPrePostActionTy (CGOpenMPRuntimeNVPTX &RT,
323
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
324
+ const OMPExecutableDirective &D)
325
+ : RT(RT), EST(EST), D(D) {}
326
+ void Enter (CodeGenFunction &CGF) override {
327
+ RT.emitSpmdEntryHeader (CGF, EST, D);
328
+ }
329
+ void Exit (CodeGenFunction &CGF) override {
330
+ RT.emitSpmdEntryFooter (CGF, EST);
331
+ }
332
+ } Action (*this , EST, D);
333
+ CodeGen.setAction (Action);
334
+ emitTargetOutlinedFunctionHelper (D, ParentName, OutlinedFn, OutlinedFnID,
335
+ IsOffloadEntry, CodeGen);
336
+ return ;
337
+ }
338
+
339
+ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader (
340
+ CodeGenFunction &CGF, EntryFunctionState &EST,
341
+ const OMPExecutableDirective &D) {
342
+ auto &Bld = CGF.Builder ;
343
+
344
+ // Setup BBs in entry function.
345
+ llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock (" .execute" );
346
+ EST.ExitBB = CGF.createBasicBlock (" .exit" );
347
+
348
+ // Initialize the OMP state in the runtime; called by all active threads.
349
+ // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
350
+ // based on code analysis of the target region.
351
+ llvm::Value *Args[] = {getThreadLimit (CGF, /* IsInSpmdExecutionMode=*/ true ),
352
+ /* RequiresOMPRuntime=*/ Bld.getInt16 (1 ),
353
+ /* RequiresDataSharing=*/ Bld.getInt16 (1 )};
354
+ CGF.EmitRuntimeCall (
355
+ createNVPTXRuntimeFunction (OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
356
+ CGF.EmitBranch (ExecuteBB);
357
+
358
+ CGF.EmitBlock (ExecuteBB);
359
+ }
360
+
361
+ void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter (CodeGenFunction &CGF,
362
+ EntryFunctionState &EST) {
363
+ if (!EST.ExitBB )
364
+ EST.ExitBB = CGF.createBasicBlock (" .exit" );
365
+
366
+ llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock (" .omp.deinit" );
367
+ CGF.EmitBranch (OMPDeInitBB);
368
+
369
+ CGF.EmitBlock (OMPDeInitBB);
370
+ // DeInitialize the OMP state in the runtime; called by all active threads.
371
+ CGF.EmitRuntimeCall (
372
+ createNVPTXRuntimeFunction (OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
373
+ CGF.EmitBranch (EST.ExitBB );
374
+
375
+ CGF.EmitBlock (EST.ExitBB );
376
+ EST.ExitBB = nullptr ;
377
+ }
378
+
379
+ // Create a unique global variable to indicate the execution mode of this target
380
+ // region. The execution mode is either 'generic', or 'spmd' depending on the
381
+ // target directive. This variable is picked up by the offload library to setup
382
+ // the device appropriately before kernel launch. If the execution mode is
383
+ // 'generic', the runtime reserves one warp for the master, otherwise, all
384
+ // warps participate in parallel work.
385
+ static void setPropertyExecutionMode (CodeGenModule &CGM, StringRef Name,
386
+ CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
387
+ (void )new llvm::GlobalVariable (
388
+ CGM.getModule (), CGM.Int8Ty , /* isConstant=*/ true ,
389
+ llvm::GlobalValue::WeakAnyLinkage,
390
+ llvm::ConstantInt::get (CGM.Int8Ty , Mode), Name + Twine (" _exec_mode" ));
391
+ }
392
+
255
393
void CGOpenMPRuntimeNVPTX::emitWorkerFunction (WorkerFunctionState &WST) {
256
394
auto &Ctx = CGM.getContext ();
257
395
@@ -385,6 +523,22 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
385
523
RTLFn = CGM.CreateRuntimeFunction (FnTy, " __kmpc_kernel_deinit" );
386
524
break ;
387
525
}
526
+ case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
527
+ // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
528
+ // short RequiresOMPRuntime, short RequiresDataSharing);
529
+ llvm::Type *TypeParams[] = {CGM.Int32Ty , CGM.Int16Ty , CGM.Int16Ty };
530
+ llvm::FunctionType *FnTy =
531
+ llvm::FunctionType::get (CGM.VoidTy , TypeParams, /* isVarArg*/ false );
532
+ RTLFn = CGM.CreateRuntimeFunction (FnTy, " __kmpc_spmd_kernel_init" );
533
+ break ;
534
+ }
535
+ case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
536
+ // Build void __kmpc_spmd_kernel_deinit();
537
+ llvm::FunctionType *FnTy =
538
+ llvm::FunctionType::get (CGM.VoidTy , llvm::None, /* isVarArg*/ false );
539
+ RTLFn = CGM.CreateRuntimeFunction (FnTy, " __kmpc_spmd_kernel_deinit" );
540
+ break ;
541
+ }
388
542
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
389
543
// / Build void __kmpc_kernel_prepare_parallel(
390
544
// / void *outlined_function);
@@ -463,12 +617,27 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
463
617
464
618
assert (!ParentName.empty () && " Invalid target region parent name!" );
465
619
466
- emitGenericKernel (D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
467
- CodeGen);
620
+ CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
621
+ getExecutionModeForDirective (CGM, D);
622
+ switch (Mode) {
623
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
624
+ emitGenericKernel (D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
625
+ CodeGen);
626
+ break ;
627
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
628
+ emitSpmdKernel (D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
629
+ CodeGen);
630
+ break ;
631
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
632
+ llvm_unreachable (
633
+ " Unknown programming model for OpenMP directive on NVPTX target." );
634
+ }
635
+
636
+ setPropertyExecutionMode (CGM, OutlinedFn->getName (), Mode);
468
637
}
469
638
470
639
/// Construct the NVPTX OpenMP runtime code generator. The execution mode
/// starts out as Unknown. Construction is only valid when compiling OpenMP
/// device code; any other configuration is treated as unreachable.
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  const bool IsDeviceCompilation = CGM.getLangOpts().OpenMPIsDevice;
  if (!IsDeviceCompilation)
    llvm_unreachable(" OpenMP NVPTX can only handle device code.");
}
@@ -523,7 +692,10 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
523
692
if (!CGF.HaveInsertPoint ())
524
693
return ;
525
694
526
- emitGenericParallelCall (CGF, Loc, OutlinedFn, CapturedVars, IfCond);
695
+ if (isInSpmdExecutionMode ())
696
+ emitSpmdParallelCall (CGF, Loc, OutlinedFn, CapturedVars, IfCond);
697
+ else
698
+ emitGenericParallelCall (CGF, Loc, OutlinedFn, CapturedVars, IfCond);
527
699
}
528
700
529
701
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall (
@@ -593,3 +765,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
593
765
ThenRCG (CGF);
594
766
}
595
767
}
768
+
769
+ void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall (
770
+ CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
771
+ ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
772
+ // Just call the outlined function to execute the parallel region.
773
+ // OutlinedFn(>id, &zero, CapturedStruct);
774
+ //
775
+ // TODO: Do something with IfCond when support for the 'if' clause
776
+ // is added on Spmd target directives.
777
+ llvm::SmallVector<llvm::Value *, 16 > OutlinedFnArgs;
778
+ OutlinedFnArgs.push_back (
779
+ llvm::ConstantPointerNull::get (CGM.Int32Ty ->getPointerTo ()));
780
+ OutlinedFnArgs.push_back (
781
+ llvm::ConstantPointerNull::get (CGM.Int32Ty ->getPointerTo ()));
782
+ OutlinedFnArgs.append (CapturedVars.begin (), CapturedVars.end ());
783
+ CGF.EmitCallOrInvoke (OutlinedFn, OutlinedFnArgs);
784
+ }
0 commit comments