Skip to content

Commit 19727e3

Browse files
committed
[AMDGPU] Enable divergence predicates for ctlz/cttz
ctlz/cttz get lowered to a set of target opcodes. This change enables ISel to select the SALU or VALU form according to the SDNode divergence. CTLZ: S_FLBIT_I32_B32 if uniform and V_FFBH_U32_e64 if divergent. CTTZ: S_FF1_I32_B32 if uniform and V_FFBL_B32_e64 if divergent. Also, @llvm.amdgcn.sffbh.i32 gets lowered to S_FLBIT_I32 if uniform and V_FFBH_I32_e64 if divergent. NOTE: The 64-bit versions S_FF1_I32_B64 and S_FLBIT_I32_B64 are not currently supported by the DAG ISel; ctlz/cttz with i64 input are split into two 32-bit instructions. Nevertheless, these 64-bit instructions already have patterns, and those patterns were equipped with the divergence predicates to make sure they will be selected correctly when enabled. Reviewed By: rampitec. Differential Revision: https://reviews.llvm.org/D116044
1 parent 4fe5543 commit 19727e3

File tree

2 files changed

+69
-5
lines changed

2 files changed

+69
-5
lines changed

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -257,22 +257,22 @@ let isReMaterializable = 1 in {
257257
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
258258
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
259259
def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
260-
[(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
260+
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))]
261261
>;
262262

263263
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
264-
[(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
264+
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))]
265265
>;
266266

267267
def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
268-
[(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
268+
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))]
269269
>;
270270

271271
def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
272-
[(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
272+
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))]
273273
>;
274274
def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
275-
[(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
275+
[(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))]
276276
>;
277277
def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
278278
def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
2+
3+
; GCN-LABEL: name: s_ctlz_i32
4+
; GCN: S_FLBIT_I32_B32
5+
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
6+
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
7+
store i32 %ctlz, i32 addrspace(1)* %out, align 4
8+
ret void
9+
}
10+
; GCN-LABEL: name: v_ctlz_i32
11+
; GCN: V_FFBH_U32_e64
12+
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
13+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
14+
%in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
15+
%val = load i32, i32 addrspace(1)* %in.gep, align 4
16+
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
17+
store i32 %ctlz, i32 addrspace(1)* %out, align 4
18+
ret void
19+
}
20+
21+
; GCN-LABEL: name: s_cttz_i32
22+
; GCN: S_FF1_I32_B32
23+
define amdgpu_kernel void @s_cttz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
24+
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
25+
store i32 %cttz, i32 addrspace(1)* %out, align 4
26+
ret void
27+
}
28+
29+
; GCN-LABEL: name: v_cttz_i32
30+
; GCN: V_FFBL_B32_e64
31+
define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
32+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
33+
%in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
34+
%val = load i32, i32 addrspace(1)* %in.gep, align 4
35+
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
36+
store i32 %cttz, i32 addrspace(1)* %out, align 4
37+
ret void
38+
}
39+
40+
; GCN-LABEL: name: s_flbit
41+
; GCN: S_FLBIT_I32
42+
define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
43+
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
44+
store i32 %r, i32 addrspace(1)* %out, align 4
45+
ret void
46+
}
47+
48+
; GCN-LABEL: name: v_flbit
49+
; GCN: V_FFBH_I32_e64
50+
define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
51+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
52+
%in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
53+
%val = load i32, i32 addrspace(1)* %in.gep, align 4
54+
%r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
55+
store i32 %r, i32 addrspace(1)* %out, align 4
56+
ret void
57+
}
58+
59+
60+
declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
61+
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
62+
declare i32 @llvm.amdgcn.sffbh.i32(i32)
63+
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
64+

0 commit comments

Comments
 (0)