Skip to content
This repository was archived by the owner on Nov 1, 2021. It is now read-only.

Commit 5006cc1

Browse files
committed
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device.
This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@292428 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 2e52b0f commit 5006cc1

File tree

4 files changed

+414
-21
lines changed

4 files changed

+414
-21
lines changed

lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

+198-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ enum OpenMPRTLFunctionNVPTX {
2626
OMPRTL_NVPTX__kmpc_kernel_init,
2727
/// \brief Call to void __kmpc_kernel_deinit();
2828
OMPRTL_NVPTX__kmpc_kernel_deinit,
29+
/// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
30+
/// short RequiresOMPRuntime, short RequiresDataSharing);
31+
OMPRTL_NVPTX__kmpc_spmd_kernel_init,
32+
/// \brief Call to void __kmpc_spmd_kernel_deinit();
33+
OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
2934
/// \brief Call to void __kmpc_kernel_prepare_parallel(void
3035
/// *outlined_function);
3136
OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
@@ -76,6 +81,25 @@ class NVPTXActionTy final : public PrePostActionTy {
7681
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
7782
}
7883
};
84+
85+
// A class to track the execution mode when codegening directives within
86+
// a target region. The appropriate mode (generic/spmd) is set on entry
87+
// to the target region and used by containing directives such as 'parallel'
88+
// to emit optimized code.
89+
class ExecutionModeRAII {
90+
private:
91+
CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
92+
CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
93+
94+
public:
95+
ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
96+
CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
97+
: Mode(Mode) {
98+
SavedMode = Mode;
99+
Mode = NewMode;
100+
}
101+
~ExecutionModeRAII() { Mode = SavedMode; }
102+
};
79103
} // anonymous namespace
80104

81105
/// Get the GPU warp size.
@@ -116,12 +140,17 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
116140
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
117141

118142
/// Get the value of the thread_limit clause in the teams directive.
119-
/// The runtime encodes thread_limit in the launch parameter, always starting
120-
/// thread_limit+warpSize threads per team.
121-
static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
143+
/// For the 'generic' execution mode, the runtime encodes thread_limit in
144+
/// the launch parameters, always starting thread_limit+warpSize threads per
145+
/// CTA. The threads in the last warp are reserved for master execution.
146+
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
/// \param CGF Function being emitted; its builder is used for the subtraction.
/// \param IsInSpmdExecutionMode When true the result of
/// getNVPTXNumThreads(CGF) is returned unchanged; when false (the default)
/// the result of getNVPTXWarpSize(CGF) is subtracted from it.
/// \returns The value described above; in the generic case it is the
/// subtraction result named "thread_limit".
147+
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
148+
bool IsInSpmdExecutionMode = false) {
122149
CGBuilderTy &Bld = CGF.Builder;
123-
return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
124-
"thread_limit");
150+
return IsInSpmdExecutionMode
151+
? getNVPTXNumThreads(CGF)
152+
: Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
153+
"thread_limit");
125154
}
126155

127156
/// Get the thread id of the OMP master thread.
@@ -159,12 +188,33 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
159188
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
160189
}
161190

191+
bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
192+
return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
193+
}
194+
195+
/// Map a target directive to the execution mode used to emit it.
/// 'target' is emitted in Generic mode and 'target parallel' in Spmd mode;
/// any other directive kind is reported as unreachable.
/// \param CGM Module being generated (not consulted by this function).
/// \param D Directive whose kind selects the mode.
static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  using ExecutionMode = CGOpenMPRuntimeNVPTX::ExecutionMode;

  switch (D.getDirectiveKind()) {
  case OMPD_target:
    return ExecutionMode::Generic;
  case OMPD_target_parallel:
    return ExecutionMode::Spmd;
  default:
    break;
  }
  // Every directive kind that is not handled above ends up here.
  llvm_unreachable("Unsupported directive on NVPTX device.");
}
209+
162210
void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
163211
StringRef ParentName,
164212
llvm::Function *&OutlinedFn,
165213
llvm::Constant *&OutlinedFnID,
166214
bool IsOffloadEntry,
167215
const RegionCodeGenTy &CodeGen) {
216+
ExecutionModeRAII ModeRAII(CurrentExecutionMode,
217+
CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
168218
EntryFunctionState EST;
169219
WorkerFunctionState WST(CGM);
170220
Work.clear();
@@ -252,6 +302,94 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
252302
EST.ExitBB = nullptr;
253303
}
254304

305+
void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
306+
StringRef ParentName,
307+
llvm::Function *&OutlinedFn,
308+
llvm::Constant *&OutlinedFnID,
309+
bool IsOffloadEntry,
310+
const RegionCodeGenTy &CodeGen) {
311+
ExecutionModeRAII ModeRAII(CurrentExecutionMode,
312+
CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
313+
EntryFunctionState EST;
314+
315+
// Emit target region as a standalone region.
316+
class NVPTXPrePostActionTy : public PrePostActionTy {
317+
CGOpenMPRuntimeNVPTX &RT;
318+
CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
319+
const OMPExecutableDirective &D;
320+
321+
public:
322+
NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
323+
CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
324+
const OMPExecutableDirective &D)
325+
: RT(RT), EST(EST), D(D) {}
326+
void Enter(CodeGenFunction &CGF) override {
327+
RT.emitSpmdEntryHeader(CGF, EST, D);
328+
}
329+
void Exit(CodeGenFunction &CGF) override {
330+
RT.emitSpmdEntryFooter(CGF, EST);
331+
}
332+
} Action(*this, EST, D);
333+
CodeGen.setAction(Action);
334+
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
335+
IsOffloadEntry, CodeGen);
336+
return;
337+
}
338+
339+
/// Emit the prologue of an Spmd-mode entry function: create the ".execute"
/// and ".exit" blocks, call __kmpc_spmd_kernel_init, then branch to and
/// continue emission in ".execute".
/// \param CGF Function being emitted.
/// \param EST Entry state; its ExitBB is set to the new ".exit" block.
/// \param D Target directive (not consulted by this function yet).
void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Builder = CGF.Builder;

  // Basic blocks of the entry function.
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Runtime state initialization, executed by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *ThreadLimit =
      getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true);
  llvm::Value *RequiresOMPRuntime = Builder.getInt16(1);
  llvm::Value *RequiresDataSharing = Builder.getInt16(1);
  llvm::Value *InitArgs[] = {ThreadLimit, RequiresOMPRuntime,
                             RequiresDataSharing};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init),
      InitArgs);

  // The region body is emitted into ".execute".
  CGF.EmitBranch(BodyBB);
  CGF.EmitBlock(BodyBB);
}
360+
361+
/// Emit the epilogue of an Spmd-mode entry function: an ".omp.deinit" block
/// that calls __kmpc_spmd_kernel_deinit, followed by the exit block.
/// \param CGF Function being emitted.
/// \param EST Entry state; an exit block is created if none is recorded, and
/// ExitBB is reset to null once the block has been emitted.
void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  // Make sure there is an exit block to branch to.
  if (EST.ExitBB == nullptr)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *DeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(DeInitBB);
  CGF.EmitBlock(DeInitBB);

  // Runtime state teardown, executed by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);

  llvm::BasicBlock *ExitBB = EST.ExitBB;
  CGF.EmitBranch(ExitBB);
  CGF.EmitBlock(ExitBB);

  // The exit block has been consumed; forget it.
  EST.ExitBB = nullptr;
}
378+
379+
// Create a unique global variable to indicate the execution mode of this target
380+
// region. The execution mode is either 'generic', or 'spmd' depending on the
381+
// target directive. This variable is picked up by the offload library to setup
382+
// the device appropriately before kernel launch. If the execution mode is
383+
// 'generic', the runtime reserves one warp for the master, otherwise, all
384+
// warps participate in parallel work.
385+
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
386+
CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
387+
(void)new llvm::GlobalVariable(
388+
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
389+
llvm::GlobalValue::WeakAnyLinkage,
390+
llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
391+
}
392+
255393
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
256394
auto &Ctx = CGM.getContext();
257395

@@ -385,6 +523,22 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
385523
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
386524
break;
387525
}
526+
case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
527+
// Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
528+
// short RequiresOMPRuntime, short RequiresDataSharing);
529+
llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
530+
llvm::FunctionType *FnTy =
531+
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
532+
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
533+
break;
534+
}
535+
case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
536+
// Build void __kmpc_spmd_kernel_deinit();
537+
llvm::FunctionType *FnTy =
538+
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
539+
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
540+
break;
541+
}
388542
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
389543
/// Build void __kmpc_kernel_prepare_parallel(
390544
/// void *outlined_function);
@@ -463,12 +617,27 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
463617

464618
assert(!ParentName.empty() && "Invalid target region parent name!");
465619

466-
emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
467-
CodeGen);
620+
CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
621+
getExecutionModeForDirective(CGM, D);
622+
switch (Mode) {
623+
case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
624+
emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
625+
CodeGen);
626+
break;
627+
case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
628+
emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
629+
CodeGen);
630+
break;
631+
case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
632+
llvm_unreachable(
633+
"Unknown programming model for OpenMP directive on NVPTX target.");
634+
}
635+
636+
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
468637
}
469638

470639
/// Construct the NVPTX OpenMP runtime for \p CGM. The added initializer
/// starts CurrentExecutionMode at ExecutionMode::Unknown.
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
471-
: CGOpenMPRuntime(CGM) {
640+
: CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
472641
// This runtime must only be instantiated when compiling device code.
if (!CGM.getLangOpts().OpenMPIsDevice)
473642
llvm_unreachable("OpenMP NVPTX can only handle device code.");
474643
}
@@ -523,7 +692,10 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
523692
if (!CGF.HaveInsertPoint())
524693
return;
525694

526-
emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
695+
if (isInSpmdExecutionMode())
696+
emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
697+
else
698+
emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
527699
}
528700

529701
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
@@ -593,3 +765,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
593765
ThenRCG(CGF);
594766
}
595767
}
768+
769+
/// Emit the call for a 'parallel' region nested in an Spmd target region.
/// The outlined function is called directly, with two null kmp_int32
/// pointers as the leading arguments followed by the captured variables:
///   OutlinedFn(&GTid, &zero, CapturedStruct);
/// \p Loc and \p IfCond are not consulted here.
void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.

  // Both leading arguments are null pointers to a 32-bit integer.
  llvm::Value *NullInt32Ptr =
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo());

  llvm::SmallVector<llvm::Value *, 16> CallArgs;
  CallArgs.push_back(NullInt32Ptr);
  CallArgs.push_back(NullInt32Ptr);
  for (llvm::Value *Captured : CapturedVars)
    CallArgs.push_back(Captured);

  CGF.EmitCallOrInvoke(OutlinedFn, CallArgs);
}

lib/CodeGen/CGOpenMPRuntimeNVPTX.h

+60
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
4343
void createWorkerFunction(CodeGenModule &CGM);
4444
};
4545

46+
/// \brief Returns true if the currently recorded execution mode is Spmd.
bool isInSpmdExecutionMode() const;
47+
4648
/// \brief Emit the worker function for the current target region.
4749
void emitWorkerFunction(WorkerFunctionState &WST);
4850

@@ -58,6 +60,13 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
5860
/// function.
5961
void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
6062

63+
/// \brief Helper for Spmd mode target directive's entry function.
/// Creates the ".execute" and ".exit" blocks (the latter is stored in
/// \p EST), emits the call to __kmpc_spmd_kernel_init and continues emission
/// in ".execute".
/// \param CGF Function being emitted.
/// \param EST State of the entry function.
/// \param D Target directive being emitted.
64+
void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
65+
const OMPExecutableDirective &D);
66+
67+
/// \brief Signal termination of Spmd mode execution.
/// Emits an ".omp.deinit" block that calls __kmpc_spmd_kernel_deinit,
/// followed by the exit block (created here if \p EST has none), and then
/// clears the exit block recorded in \p EST.
/// \param CGF Function being emitted.
/// \param EST State of the entry function.
68+
void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
69+
6170
/// \brief Returns specified OpenMP runtime function for the current OpenMP
6271
/// implementation. Specialized for the NVPTX device.
6372
/// \param Function OpenMP runtime function.
@@ -87,6 +96,22 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
8796
llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
8897
const RegionCodeGenTy &CodeGen);
8998

99+
/// \brief Emit outlined function specialized for the Single Program
100+
/// Multiple Data programming model for applicable target directives on the
101+
/// NVPTX device.
102+
/// \param D Directive to emit.
103+
/// \param ParentName Name of the function that encloses the target region.
104+
/// \param OutlinedFn Outlined function value to be defined by this call.
105+
/// \param OutlinedFnID Outlined function ID value to be defined by this call.
106+
/// \param IsOffloadEntry True if the outlined function is an offload entry.
107+
/// An outlined function may not be an entry if, e.g. the if clause always
108+
/// evaluates to false.
109+
/// \param CodeGen Object containing the target statements.
110+
void emitSpmdKernel(const OMPExecutableDirective &D, StringRef ParentName,
111+
llvm::Function *&OutlinedFn,
112+
llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
113+
const RegionCodeGenTy &CodeGen);
114+
90115
/// \brief Emit outlined function for 'target' directive on the NVPTX
91116
/// device.
92117
/// \param D Directive to emit.
@@ -118,6 +143,22 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
118143
ArrayRef<llvm::Value *> CapturedVars,
119144
const Expr *IfCond);
120145

146+
/// \brief Emits a direct call of the \a OutlinedFn for a parallel directive
147+
/// within an SPMD target directive. The definition passes two null
148+
/// kmp_int32 pointers followed by the values in \a CapturedVars.
149+
/// \param CGF Function in which the call is emitted.
150+
/// \param Loc Source location of the directive (not consulted by the
151+
/// definition).
152+
/// \param OutlinedFn Outlined function to be run in parallel threads.
153+
/// NOTE(review): the exact function type is not visible here - confirm.
154+
/// \param CapturedVars Values forwarded to \a OutlinedFn after the two
155+
/// leading null pointer arguments.
156+
/// \param IfCond Condition in the associated 'if' clause, if it was
/// specified, nullptr otherwise. Not consulted by the definition yet (see
/// the TODO there).
157+
void emitSpmdParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
158+
llvm::Value *OutlinedFn,
159+
ArrayRef<llvm::Value *> CapturedVars,
160+
const Expr *IfCond);
161+
121162
protected:
122163
/// \brief Get the function name of an outlined region.
123164
// The name can be customized depending on the target.
@@ -192,6 +233,25 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
192233
llvm::Value *OutlinedFn,
193234
ArrayRef<llvm::Value *> CapturedVars,
194235
const Expr *IfCond) override;
236+
237+
public:
238+
/// Target codegen is specialized based on two programming models: the
239+
/// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
240+
/// model for constructs like 'target parallel' that support it.
/// The numeric value of the selected enumerator is written into the
/// "<kernel>_exec_mode" global by setPropertyExecutionMode, so the order of
/// the enumerators must not be changed without updating its consumers.
241+
enum ExecutionMode {
242+
/// Single Program Multiple Data.
243+
Spmd,
244+
/// Generic codegen to support fork-join model.
245+
Generic,
246+
/// No mode selected. This is the value the constructor starts with, and
/// emitTargetOutlinedFunction treats it as unreachable.
Unknown,
247+
};
248+
249+
private:
250+
// Track the execution mode while generating code for directives within a
250+
// target region. The appropriate mode (generic/spmd) is set on entry to the
251+
// target region and used by contained directives such as 'parallel'
252+
// to emit optimized code. The constructor initializes it to Unknown.
253+
ExecutionMode CurrentExecutionMode;
195255
};
196256

197257
} // CodeGen namespace.

0 commit comments

Comments
 (0)