Skip to content
This repository was archived by the owner on Nov 1, 2021. It is now read-only.

Commit 8d6d7ca

Browse files
committed
[OpenMP] Support for the num_threads-clause on 'target parallel' on the NVPTX device.
This patch adds support for the Spmd construct 'target parallel' on the NVPTX device. This involves ignoring the num_threads clause on the device since the number of threads in this combined construct is already set on the host through the call to __tgt_target_teams(). Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D29083 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@292999 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 440f379 commit 8d6d7ca

File tree

3 files changed

+145
-0
lines changed

3 files changed

+145
-0
lines changed

lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

+11
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,17 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
642642
llvm_unreachable("OpenMP NVPTX can only handle device code.");
643643
}
644644

645+
void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
646+
llvm::Value *NumThreads,
647+
SourceLocation Loc) {
648+
// Do nothing in case of Spmd mode and L0 parallel.
649+
// TODO: If in Spmd mode and L1 parallel emit the clause.
650+
if (isInSpmdExecutionMode())
651+
return;
652+
653+
CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
654+
}
655+
645656
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
646657
const Expr *NumTeams,
647658
const Expr *ThreadLimit,

lib/CodeGen/CGOpenMPRuntimeNVPTX.h

+8
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,14 @@ class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
170170
public:
171171
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
172172

173+
/// \brief Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
174+
/// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
175+
/// clause.
176+
/// \param NumThreads An integer value of threads.
177+
virtual void emitNumThreadsClause(CodeGenFunction &CGF,
178+
llvm::Value *NumThreads,
179+
SourceLocation Loc) override;
180+
173181
/// \brief This function ought to emit, in the general case, a call to
174182
// the openmp runtime kmpc_push_num_teams. In NVPTX backend it is not needed
175183
// as these numbers are obtained through the PTX grid and block configuration.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Test target codegen - host bc file has to be created first.
2+
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
3+
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
4+
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
5+
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
6+
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
7+
// expected-no-diagnostics
8+
#ifndef HEADER
9+
#define HEADER
10+
11+
// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
12+
// CHECK-DAG: {{@__omp_offloading_.+l21}}_exec_mode = weak constant i8 0
13+
// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
14+
15+
template<typename tx>
16+
tx ftemplate(int n) {
17+
tx a = 0;
18+
short aa = 0;
19+
tx b[10];
20+
21+
#pragma omp target parallel map(tofrom: aa) num_threads(1024)
22+
{
23+
aa += 1;
24+
}
25+
26+
#pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n)
27+
{
28+
a += 1;
29+
aa += 1;
30+
b[2] += 1;
31+
}
32+
33+
return a;
34+
}
35+
36+
int bar(int n){
37+
int a = 0;
38+
39+
a += ftemplate<int>(n);
40+
41+
return a;
42+
}
43+
44+
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l21}}(
45+
// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
46+
// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
47+
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
48+
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
49+
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
50+
// CHECK: br label {{%?}}[[EXEC:.+]]
51+
//
52+
// CHECK: [[EXEC]]
53+
// CHECK-NOT: call void @__kmpc_push_num_threads
54+
// CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
55+
// CHECK: br label {{%?}}[[DONE:.+]]
56+
//
57+
// CHECK: [[DONE]]
58+
// CHECK: call void @__kmpc_spmd_kernel_deinit()
59+
// CHECK: br label {{%?}}[[EXIT:.+]]
60+
//
61+
// CHECK: [[EXIT]]
62+
// CHECK: ret void
63+
// CHECK: }
64+
65+
// CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
66+
// CHECK: = alloca i32*, align
67+
// CHECK: = alloca i32*, align
68+
// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
69+
// CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
70+
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
71+
// CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
72+
// CHECK: store i16 {{%.+}}, i16* [[AA]], align
73+
// CHECK: ret void
74+
// CHECK: }
75+
76+
77+
78+
79+
80+
81+
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
82+
// CHECK: [[A_ADDR:%.+]] = alloca i32*, align
83+
// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
84+
// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
85+
// CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
86+
// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
87+
// CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
88+
// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
89+
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
90+
// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
91+
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
92+
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
93+
// CHECK: br label {{%?}}[[EXEC:.+]]
94+
//
95+
// CHECK: [[EXEC]]
96+
// CHECK-NOT: call void @__kmpc_push_num_threads
97+
// CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
98+
// CHECK: br label {{%?}}[[DONE:.+]]
99+
//
100+
// CHECK: [[DONE]]
101+
// CHECK: call void @__kmpc_spmd_kernel_deinit()
102+
// CHECK: br label {{%?}}[[EXIT:.+]]
103+
//
104+
// CHECK: [[EXIT]]
105+
// CHECK: ret void
106+
// CHECK: }
107+
108+
// CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
109+
// CHECK: = alloca i32*, align
110+
// CHECK: = alloca i32*, align
111+
// CHECK: [[A_ADDR:%.+]] = alloca i32*, align
112+
// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
113+
// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
114+
// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
115+
// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
116+
// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
117+
// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
118+
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
119+
// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
120+
// CHECK: store i32 {{%.+}}, i32* [[A]], align
121+
// CHECK: store i16 {{%.+}}, i16* [[AA]], align
122+
// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
123+
// CHECK: store i32 {{%.+}}, i32* [[ELT]], align
124+
// CHECK: ret void
125+
// CHECK: }
126+
#endif

0 commit comments

Comments
 (0)