@@ -4894,19 +4894,29 @@ static const Stmt *ignoreCompoundStmts(const Stmt *Body) {
4894
4894
return Body;
4895
4895
}
4896
4896
4897
- /// \brief Emit the num_teams clause of an enclosed teams directive at the
4898
- /// target region scope. If there is no teams directive associated with the
4899
- /// target directive, or if there is no num_teams clause associated with the
4900
- /// enclosed teams directive, return nullptr.
4897
+ /// Emit the number of teams for a target directive. Inspect the num_teams
4898
+ /// clause associated with a teams construct combined or closely nested
4899
+ /// with the target directive.
4900
+ ///
4901
+ /// Emit a single team for directives such as 'target parallel' that
4902
+ /// have no associated teams construct.
4903
+ ///
4904
+ /// Otherwise, return nullptr.
4901
4905
static llvm::Value *
4902
- emitNumTeamsClauseForTargetDirective (CGOpenMPRuntime &OMPRuntime,
4903
- CodeGenFunction &CGF,
4904
- const OMPExecutableDirective &D) {
4906
+ emitNumTeamsForTargetDirective (CGOpenMPRuntime &OMPRuntime,
4907
+ CodeGenFunction &CGF,
4908
+ const OMPExecutableDirective &D) {
4905
4909
4906
4910
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
4907
4911
"teams directive expected to be "
4908
4912
"emitted only for the host!");
4909
4913
4914
+ // If the target directive is combined with a parallel directive but not a
4915
+ // teams directive, start one team.
4916
+ if (isOpenMPParallelDirective(D.getDirectiveKind()) &&
4917
+ !isOpenMPTeamsDirective(D.getDirectiveKind()))
4918
+ return CGF.Builder.getInt32(1);
4919
+
4910
4920
// FIXME: For the moment we do not support combined directives with target and
4911
4921
// teams, so we do not expect to get any num_teams clause in the provided
4912
4922
// directive. Once we support that, this assertion can be replaced by the
@@ -4943,19 +4953,56 @@ emitNumTeamsClauseForTargetDirective(CGOpenMPRuntime &OMPRuntime,
4943
4953
return nullptr;
4944
4954
}
4945
4955
4946
- /// \brief Emit the thread_limit clause of an enclosed teams directive at the
4947
- /// target region scope. If there is no teams directive associated with the
4948
- /// target directive, or if there is no thread_limit clause associated with the
4949
- /// enclosed teams directive, return nullptr.
4956
+ /// Emit the number of threads for a target directive. Inspect the
4957
+ /// thread_limit clause associated with a teams construct combined or closely
4958
+ /// nested with the target directive.
4959
+ ///
4960
+ /// Emit the num_threads clause for directives such as 'target parallel' that
4961
+ /// have no associated teams construct.
4962
+ ///
4963
+ /// Otherwise, return nullptr.
4950
4964
static llvm::Value *
4951
- emitThreadLimitClauseForTargetDirective (CGOpenMPRuntime &OMPRuntime,
4952
- CodeGenFunction &CGF,
4953
- const OMPExecutableDirective &D) {
4965
+ emitNumThreadsForTargetDirective (CGOpenMPRuntime &OMPRuntime,
4966
+ CodeGenFunction &CGF,
4967
+ const OMPExecutableDirective &D) {
4954
4968
4955
4969
assert(!CGF.getLangOpts().OpenMPIsDevice && "Clauses associated with the "
4956
4970
"teams directive expected to be "
4957
4971
"emitted only for the host!");
4958
4972
4973
+ auto &Bld = CGF.Builder;
4974
+
4975
+ //
4976
+ // If the target directive is combined with a teams directive:
4977
+ // Return the value in the thread_limit clause, if any.
4978
+ //
4979
+ // If the target directive is combined with a parallel directive:
4980
+ // Return the value in the num_threads clause, if any.
4981
+ //
4982
+ // If both clauses are set, select the minimum of the two.
4983
+ //
4984
+ // If neither teams nor parallel combined directives set the number of threads
4985
+ // in a team, return 0 to denote the runtime default.
4986
+ //
4987
+ // If this is neither a teams nor a parallel directive, return nullptr.
4988
+
4989
+ if (isOpenMPParallelDirective(D.getDirectiveKind())) {
4990
+ llvm::Value *DefaultThreadLimitVal = Bld.getInt32(0);
4991
+ llvm::Value *NumThreadsVal = nullptr;
4992
+
4993
+ if (const auto *NumThreadsClause =
4994
+ D.getSingleClause<OMPNumThreadsClause>()) {
4995
+ CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF);
4996
+ llvm::Value *NumThreads =
4997
+ CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(),
4998
+ /*IgnoreResultAssign*/ true);
4999
+ NumThreadsVal =
5000
+ Bld.CreateIntCast(NumThreads, CGF.Int32Ty, /*IsSigned=*/true);
5001
+ }
5002
+
5003
+ return NumThreadsVal ? NumThreadsVal : DefaultThreadLimitVal;
5004
+ }
5005
+
4959
5006
// FIXME: For the moment we do not support combined directives with target and
4960
5007
// teams, so we do not expect to get any thread_limit clause in the provided
4961
5008
// directive. Once we support that, this assertion can be replaced by the
@@ -6041,24 +6088,50 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
6041
6088
// Return value of the runtime offloading call.
6042
6089
llvm::Value *Return;
6043
6090
6044
- auto *NumTeams = emitNumTeamsClauseForTargetDirective (RT, CGF, D);
6045
- auto *ThreadLimit = emitThreadLimitClauseForTargetDirective (RT, CGF, D);
6091
+ auto *NumTeams = emitNumTeamsForTargetDirective (RT, CGF, D);
6092
+ auto *NumThreads = emitNumThreadsForTargetDirective (RT, CGF, D);
6046
6093
6047
- // If we have NumTeams defined this means that we have an enclosed teams
6048
- // region. Therefore we also expect to have ThreadLimit defined. These two
6049
- // values should be defined in the presence of a teams directive, regardless
6050
- // of having any clauses associated. If the user is using teams but no
6051
- // clauses, these two values will be the default that should be passed to
6052
- // the runtime library - a 32-bit integer with the value zero.
6094
+ // The target region is an outlined function launched by the runtime
6095
+ // via calls __tgt_target() or __tgt_target_teams().
6096
+ //
6097
+ // __tgt_target() launches a target region with one team and one thread,
6098
+ // executing a serial region. This master thread may in turn launch
6099
+ // more threads within its team upon encountering a parallel region,
6100
+ // however, no additional teams can be launched on the device.
6101
+ //
6102
+ // __tgt_target_teams() launches a target region with one or more teams,
6103
+ // each with one or more threads. This call is required for target
6104
+ // constructs such as:
6105
+ // 'target teams'
6106
+ // 'target' / 'teams'
6107
+ // 'target teams distribute parallel for'
6108
+ // 'target parallel'
6109
+ // and so on.
6110
+ //
6111
+ // Note that on the host and CPU targets, the runtime implementation of
6112
+ // these calls simply invokes the outlined function without forking threads.
6113
+ // The outlined functions themselves have runtime calls to
6114
+ // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
6115
+ // the compiler in emitTeamsCall() and emitParallelCall().
6116
+ //
6117
+ // In contrast, on the NVPTX target, the implementation of
6118
+ // __tgt_target_teams() launches a GPU kernel with the requested number
6119
+ // of teams and threads so no additional calls to the runtime are required.
6053
6120
if (NumTeams) {
6054
- assert(ThreadLimit && "Thread limit expression should be available along "
6055
- "with number of teams.");
6121
+ // If we have NumTeams defined this means that we have an enclosed teams
6122
+ // region. Therefore we also expect to have NumThreads defined. These two
6123
+ // values should be defined in the presence of a teams directive,
6124
+ // regardless of having any clauses associated. If the user is using teams
6125
+ // but no clauses, these two values will be the default that should be
6126
+ // passed to the runtime library - a 32-bit integer with the value zero.
6127
+ assert(NumThreads && "Thread limit expression should be available along "
6128
+ "with number of teams.");
6056
6129
llvm::Value *OffloadingArgs[] = {
6057
6130
DeviceID, OutlinedFnID,
6058
6131
PointerNum, Info.BasePointersArray,
6059
6132
Info.PointersArray, Info.SizesArray,
6060
6133
Info.MapTypesArray, NumTeams,
6061
- ThreadLimit };
6134
+ NumThreads };
6062
6135
Return = CGF.EmitRuntimeCall(
6063
6136
RT.createRuntimeFunction(OMPRTL__tgt_target_teams), OffloadingArgs);
6064
6137
} else {
0 commit comments